From e7c1c00cf3fafde7496e31cacd718a51d0e7d70c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 May 2020 19:21:54 -0700 Subject: [PATCH 01/43] pstore: Drop useless try_module_get() for backend There is no reason to be doing a module get/put in pstore_register(), since the module calling pstore_register() cannot be unloaded since it hasn't finished its initialization. Remove it so there is no confusion about how registration ordering works. Link: https://lore.kernel.org/lkml/20200506152114.50375-2-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 408277ee3cdb..44f8b9742263 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -555,8 +555,6 @@ out: */ int pstore_register(struct pstore_info *psi) { - struct module *owner = psi->owner; - if (backend && strcmp(backend, psi->name)) { pr_warn("ignoring unexpected backend '%s'\n", psi->name); return -EPERM; @@ -591,10 +589,6 @@ int pstore_register(struct pstore_info *psi) sema_init(&psinfo->buf_lock, 1); spin_unlock(&pstore_lock); - if (owner && !try_module_get(owner)) { - psinfo = NULL; - return -EINVAL; - } if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); @@ -626,8 +620,6 @@ int pstore_register(struct pstore_info *psi) pr_info("Registered %s as persistent store backend\n", psi->name); - module_put(owner); - return 0; } EXPORT_SYMBOL_GPL(pstore_register); From c30b20cd96a7a08e77c12cb3326c6fd801f7fe87 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 5 May 2020 23:29:10 -0700 Subject: [PATCH 02/43] pstore: Rename "pstore_lock" to "psinfo_lock" The name "pstore_lock" sounds very global, but it is only supposed to be used for managing changes to "psinfo", so rename it accordingly. Link: https://lore.kernel.org/lkml/20200506152114.50375-3-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 44f8b9742263..347b6c07f4cf 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -69,10 +69,10 @@ static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); /* - * pstore_lock just protects "psinfo" during + * psinfo_lock just protects "psinfo" during * calls to pstore_register() */ -static DEFINE_SPINLOCK(pstore_lock); +static DEFINE_SPINLOCK(psinfo_lock); struct pstore_info *psinfo; static char *backend; @@ -574,11 +574,11 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } - spin_lock(&pstore_lock); + spin_lock(&psinfo_lock); if (psinfo) { pr_warn("backend '%s' already loaded: ignoring '%s'\n", psinfo->name, psi->name); - spin_unlock(&pstore_lock); + spin_unlock(&psinfo_lock); return -EBUSY; } @@ -587,7 +587,7 @@ int pstore_register(struct pstore_info *psi) psinfo = psi; mutex_init(&psinfo->read_mutex); sema_init(&psinfo->buf_lock, 1); - spin_unlock(&pstore_lock); + spin_unlock(&psinfo_lock); if (psi->flags & PSTORE_FLAGS_DMESG) From cab12fd049380b1d0c3ac9f534407a8cc0ac4bba Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 May 2020 19:31:36 -0700 Subject: [PATCH 03/43] pstore: Convert "psinfo" locking to mutex Currently pstore can only have a single backend attached at a time, and it tracks the active backend via "psinfo", under a lock. The locking for this does not need to be a spinlock, and in order to avoid may_sleep() issues during future changes to pstore_unregister(), switch to a mutex instead. Link: https://lore.kernel.org/lkml/20200506152114.50375-4-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 347b6c07f4cf..d0ce22237589 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -72,7 +72,7 @@ static DECLARE_WORK(pstore_work, pstore_dowork); * psinfo_lock just protects "psinfo" during * calls to pstore_register() */ -static DEFINE_SPINLOCK(psinfo_lock); +static DEFINE_MUTEX(psinfo_lock); struct pstore_info *psinfo; static char *backend; @@ -574,11 +574,11 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } - spin_lock(&psinfo_lock); + mutex_lock(&psinfo_lock); if (psinfo) { pr_warn("backend '%s' already loaded: ignoring '%s'\n", psinfo->name, psi->name); - spin_unlock(&psinfo_lock); + mutex_unlock(&psinfo_lock); return -EBUSY; } @@ -587,7 +587,7 @@ int pstore_register(struct pstore_info *psi) psinfo = psi; mutex_init(&psinfo->read_mutex); sema_init(&psinfo->buf_lock, 1); - spin_unlock(&psinfo_lock); + mutex_unlock(&psinfo_lock); if (psi->flags & PSTORE_FLAGS_DMESG) From 47af61ffb19baa8df2b109582f87f7622fbe85cc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 5 May 2020 23:32:40 -0700 Subject: [PATCH 04/43] pstore: Rename "allpstore" to "records_list" The name "allpstore" doesn't carry much meaning, so rename it to what it actually is: the list of all records present in the filesystem. The lock is also renamed accordingly. Link: https://lore.kernel.org/lkml/20200506152114.50375-5-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d99b5d39aa90..5cc09cb315f9 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -29,8 +29,8 @@ #define PSTORE_NAMELEN 64 -static DEFINE_SPINLOCK(allpstore_lock); -static LIST_HEAD(allpstore); +static DEFINE_SPINLOCK(records_list_lock); +static LIST_HEAD(records_list); struct pstore_private { struct list_head list; @@ -196,9 +196,9 @@ static void pstore_evict_inode(struct inode *inode) clear_inode(inode); if (p) { - spin_lock_irqsave(&allpstore_lock, flags); + spin_lock_irqsave(&records_list_lock, flags); list_del(&p->list); - spin_unlock_irqrestore(&allpstore_lock, flags); + spin_unlock_irqrestore(&records_list_lock, flags); free_pstore_private(p); } } @@ -302,8 +302,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) WARN_ON(!inode_is_locked(d_inode(root))); - spin_lock_irqsave(&allpstore_lock, flags); - list_for_each_entry(pos, &allpstore, list) { + spin_lock_irqsave(&records_list_lock, flags); + list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && pos->record->psi == record->psi) { @@ -311,7 +311,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) break; } } - spin_unlock_irqrestore(&allpstore_lock, flags); + spin_unlock_irqrestore(&records_list_lock, flags); if (rc) return rc; @@ -343,9 +343,9 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) d_add(dentry, inode); - spin_lock_irqsave(&allpstore_lock, flags); - list_add(&private->list, &allpstore); - spin_unlock_irqrestore(&allpstore_lock, flags); + spin_lock_irqsave(&records_list_lock, flags); + list_add(&private->list, &records_list); + spin_unlock_irqrestore(&records_list_lock, flags); return 0; From db23491c77207ef6bec2b232238710de4755db6a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 May 2020 19:07:04 -0700 Subject: [PATCH 05/43] pstore: Convert "records_list" locking to mutex The pstorefs internal list lock doesn't need to be a spinlock and will create problems when trying to access the list in the subsequent patch that will walk the pstorefs records during pstore_unregister(). Change this to a mutex to avoid may_sleep() warnings when unregistering devices. Link: https://lore.kernel.org/lkml/20200506152114.50375-6-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/inode.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5cc09cb315f9..92ebcc75434f 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -22,14 +22,13 @@ #include #include #include -#include #include #include "internal.h" #define PSTORE_NAMELEN 64 -static DEFINE_SPINLOCK(records_list_lock); +static DEFINE_MUTEX(records_list_lock); static LIST_HEAD(records_list); struct pstore_private { @@ -192,13 +191,12 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) static void pstore_evict_inode(struct inode *inode) { struct pstore_private *p = inode->i_private; - unsigned long flags; clear_inode(inode); if (p) { - spin_lock_irqsave(&records_list_lock, flags); + mutex_lock(&records_list_lock); list_del(&p->list); - spin_unlock_irqrestore(&records_list_lock, flags); + mutex_unlock(&records_list_lock); free_pstore_private(p); } } @@ -297,12 +295,11 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) int rc = 0; char name[PSTORE_NAMELEN]; struct pstore_private *private, *pos; - unsigned long flags; size_t size = record->size + record->ecc_notice_size; WARN_ON(!inode_is_locked(d_inode(root))); - spin_lock_irqsave(&records_list_lock, flags); + mutex_lock(&records_list_lock); list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && @@ -311,7 +308,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) break; } } - spin_unlock_irqrestore(&records_list_lock, flags); + mutex_unlock(&records_list_lock); if (rc) return rc; @@ -343,9 +340,9 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) d_add(dentry, inode); - spin_lock_irqsave(&records_list_lock, flags); + mutex_lock(&records_list_lock); list_add(&private->list, &records_list); - spin_unlock_irqrestore(&records_list_lock, flags); + mutex_unlock(&records_list_lock); return 0; From 6248a0666c8a408dcc5bd952536274d5bd0f02cb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 May 2020 19:33:54 -0700 Subject: [PATCH 06/43] pstore: Add proper unregister lock checking The pstore backend lock wasn't being used during pstore_unregister(). Add sanity check and locking. Link: https://lore.kernel.org/lkml/20200506152114.50375-7-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index d0ce22237589..8c0076a1f896 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -69,8 +69,9 @@ static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); /* - * psinfo_lock just protects "psinfo" during - * calls to pstore_register() + * psinfo_lock protects "psinfo" during calls to + * pstore_register(), pstore_unregister(), and + * the filesystem mount/unmount routines. */ static DEFINE_MUTEX(psinfo_lock); struct pstore_info *psinfo; @@ -587,8 +588,6 @@ int pstore_register(struct pstore_info *psi) psinfo = psi; mutex_init(&psinfo->read_mutex); sema_init(&psinfo->buf_lock, 1); - mutex_unlock(&psinfo_lock); - if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); @@ -620,12 +619,25 @@ int pstore_register(struct pstore_info *psi) pr_info("Registered %s as persistent store backend\n", psi->name); + mutex_unlock(&psinfo_lock); return 0; } EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { + /* It's okay to unregister nothing. */ + if (!psi) + return; + + mutex_lock(&psinfo_lock); + + /* Only one backend can be registered at a time. */ + if (WARN_ON(psi != psinfo)) { + mutex_unlock(&psinfo_lock); + return; + } + /* Stop timer and make sure all work has finished. */ pstore_update_ms = -1; del_timer_sync(&pstore_timer); @@ -644,6 +656,7 @@ void pstore_unregister(struct pstore_info *psi) psinfo = NULL; backend = NULL; + mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); From 7a0ad546847a23f92f5e227fa8e4578eaa3a8d0a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 May 2020 19:41:30 -0700 Subject: [PATCH 07/43] pstore: Refactor pstorefs record list removal The "unlink" handling should perform list removal (which can also make sure records don't get double-erased), and the "evict" handling should be responsible only for memory freeing. Link: https://lore.kernel.org/lkml/20200506152114.50375-8-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/inode.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 92ebcc75434f..5f08b21b7a46 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -177,10 +177,21 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; + int rc = 0; if (!record->psi->erase) return -EPERM; + /* Make sure we can't race while removing this file. */ + mutex_lock(&records_list_lock); + if (!list_empty(&p->list)) + list_del_init(&p->list); + else + rc = -ENOENT; + mutex_unlock(&records_list_lock); + if (rc) + return rc; + mutex_lock(&record->psi->read_mutex); record->psi->erase(record); mutex_unlock(&record->psi->read_mutex); @@ -193,12 +204,7 @@ static void pstore_evict_inode(struct inode *inode) struct pstore_private *p = inode->i_private; clear_inode(inode); - if (p) { - mutex_lock(&records_list_lock); - list_del(&p->list); - mutex_unlock(&records_list_lock); - free_pstore_private(p); - } + free_pstore_private(p); } static const struct inode_operations pstore_dir_inode_operations = { @@ -417,6 +423,7 @@ static void pstore_kill_sb(struct super_block *sb) { kill_litter_super(sb); pstore_sb = NULL; + INIT_LIST_HEAD(&records_list); } static struct file_system_type pstore_fs_type = { From d2fe97545a1e2d01c0ca0105bdc59002a0d0b130 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:13:56 -0700 Subject: [PATCH 08/43] fscrypt: fix all kerneldoc warnings Fix all kerneldoc warnings in fs/crypto/ and include/linux/fscrypt.h. Most of these were due to missing documentation for function parameters. Detected with: scripts/kernel-doc -v -none fs/crypto/*.{c,h} include/linux/fscrypt.h This cleanup makes it possible to check new patches for kerneldoc warnings without having to filter out all the existing ones. For consistency, also adjust some function "brief descriptions" to include the parentheses and to wrap at 80 characters. (The latter matches the checkpatch expectation.) Link: https://lore.kernel.org/r/20200511191358.53096-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/crypto.c | 9 +++++-- fs/crypto/fname.c | 52 +++++++++++++++++++++++++++---------- fs/crypto/fscrypt_private.h | 4 +-- fs/crypto/hooks.c | 4 +-- fs/crypto/keysetup.c | 9 ++++--- fs/crypto/policy.c | 19 +++++++++++--- include/linux/fscrypt.h | 19 +++++++++----- 7 files changed, 82 insertions(+), 34 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 1ecaac7ee3cb..40c2821a341e 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -54,6 +54,7 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) /** * fscrypt_free_bounce_page() - free a ciphertext bounce page + * @bounce_page: the bounce page to free, or NULL * * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(), * or by fscrypt_alloc_bounce_page() directly. @@ -132,7 +133,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page + * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a + * pagecache page * @page: The locked pagecache page containing the block(s) to encrypt * @len: Total size of the block(s) to encrypt. Must be a nonzero * multiple of the filesystem's block size. @@ -222,7 +224,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** - * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page + * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a + * pagecache page * @page: The locked pagecache page containing the block(s) to decrypt * @len: Total size of the block(s) to decrypt. Must be a nonzero * multiple of the filesystem's block size. @@ -346,6 +349,8 @@ void fscrypt_msg(const struct inode *inode, const char *level, /** * fscrypt_init() - Set up for fs encryption. + * + * Return: 0 on success; -errno on failure */ static int __init fscrypt_init(void) { diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 4c212442a8f7..fce3fa29f058 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -18,7 +18,7 @@ #include #include "fscrypt_private.h" -/** +/* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the @@ -105,9 +105,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename - * - * The output buffer must be at least as large as the input buffer. - * Any extra space is filled with NUL padding before encryption. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the filename to encrypt + * @out: (output) the encrypted filename + * @olen: size of the encrypted filename. It must be at least @iname->len. + * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ @@ -157,8 +160,11 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, /** * fname_decrypt() - decrypt a filename - * - * The caller must have allocated sufficient memory for the @oname string. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the encrypted filename to decrypt + * @oname: (output) the decrypted filename. The caller must have allocated + * enough space for this, e.g. using fscrypt_fname_alloc_buffer(). * * Return: 0 on success, -errno on failure */ @@ -206,7 +212,10 @@ static const char lookup_table[65] = #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * base64_encode() - + * base64_encode() - base64-encode some bytes + * @src: the bytes to encode + * @len: number of bytes to encode + * @dst: (output) the base64-encoded string. Not NUL-terminated. * * Encodes the input string using characters from the set [A-Za-z0-9+,]. * The encoded string is roughly 4/3 times the size of the input string. @@ -272,7 +281,12 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, } /** - * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames + * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @max_encrypted_len: maximum length of encrypted filenames the buffer will be + * used to present + * @crypto_str: (output) buffer to allocate * * Allocate a buffer that is large enough to hold any decrypted or encoded * filename (null-terminated), for the given maximum encrypted filename length. @@ -297,9 +311,10 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode, EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * fscrypt_fname_free_buffer - free the buffer for presented filenames + * fscrypt_fname_free_buffer() - free a buffer for presented filenames + * @crypto_str: the buffer to free * - * Free the buffer allocated by fscrypt_fname_alloc_buffer(). + * Free a buffer that was allocated by fscrypt_fname_alloc_buffer(). */ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { @@ -311,10 +326,19 @@ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user - * space - * - * The caller must have allocated sufficient memory for the @oname string. + * fscrypt_fname_disk_to_usr() - convert an encrypted filename to + * user-presentable form + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @hash: first part of the name's dirhash, if applicable. This only needs to + * be provided if the filename is located in an indexed directory whose + * encryption key may be unavailable. Not needed for symlink targets. + * @minor_hash: second part of the name's dirhash, if applicable + * @iname: encrypted filename to convert. May also be "." or "..", which + * aren't actually encrypted. + * @oname: output buffer for the user-presentable filename. The caller must + * have allocated enough space for this, e.g. using + * fscrypt_fname_alloc_buffer(). * * If the key is available, we'll decrypt the disk name. Otherwise, we'll * encode it for presentation in fscrypt_nokey_name format. diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index dbced2937ec8..f547094100be 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -43,7 +43,7 @@ struct fscrypt_context_v2 { u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; }; -/** +/* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each @@ -157,7 +157,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } -/** +/* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 5ef861742921..09fb8aa0f2e9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -10,7 +10,7 @@ #include "fscrypt_private.h" /** - * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * @@ -262,7 +262,7 @@ err_free_sd: EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** - * fscrypt_get_symlink - get the target of an encrypted symlink + * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 302375e9f719..edc4590c69f0 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -475,7 +475,8 @@ out: EXPORT_SYMBOL(fscrypt_get_encryption_info); /** - * fscrypt_put_encryption_info - free most of an inode's fscrypt data + * fscrypt_put_encryption_info() - free most of an inode's fscrypt data + * @inode: an inode being evicted * * Free the inode's fscrypt_info. Filesystems must call this when the inode is * being evicted. An RCU grace period need not have elapsed yet. @@ -488,7 +489,8 @@ void fscrypt_put_encryption_info(struct inode *inode) EXPORT_SYMBOL(fscrypt_put_encryption_info); /** - * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay + * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay + * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. @@ -503,7 +505,8 @@ void fscrypt_free_inode(struct inode *inode) EXPORT_SYMBOL(fscrypt_free_inode); /** - * fscrypt_drop_inode - check whether the inode's master key has been removed + * fscrypt_drop_inode() - check whether the inode's master key has been removed + * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 10ccf945020c..0a95537fcef9 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -16,7 +16,9 @@ #include "fscrypt_private.h" /** - * fscrypt_policies_equal - check whether two encryption policies are the same + * fscrypt_policies_equal() - check whether two encryption policies are the same + * @policy1: the first policy + * @policy2: the second policy * * Return: %true if equal, else %false */ @@ -170,7 +172,9 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, } /** - * fscrypt_supported_policy - check whether an encryption policy is supported + * fscrypt_supported_policy() - check whether an encryption policy is supported + * @policy_u: the encryption policy + * @inode: the inode on which the policy will be used * * Given an encryption policy, check whether all its encryption modes and other * settings are supported by this kernel on the given inode. (But we don't @@ -192,7 +196,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy + * fscrypt_new_context_from_policy() - create a new fscrypt_context from + * an fscrypt_policy + * @ctx_u: output context + * @policy_u: input policy * * Create an fscrypt_context for an inode that is being assigned the given * encryption policy. A new nonce is randomly generated. @@ -242,7 +249,11 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, } /** - * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy + * fscrypt_policy_from_context() - convert an fscrypt_context to + * an fscrypt_policy + * @policy_u: output policy + * @ctx_u: input context + * @ctx_size: size of input context in bytes * * Given an fscrypt_context, build the corresponding fscrypt_policy. * diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index e3c2d2a15525..cb2c41f8dfde 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -75,6 +75,7 @@ static inline bool fscrypt_has_encryption_key(const struct inode *inode) /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption + * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. @@ -504,7 +505,7 @@ static inline void fscrypt_set_ops(struct super_block *sb, #endif /* !CONFIG_FS_ENCRYPTION */ /** - * fscrypt_require_key - require an inode's encryption key + * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. @@ -530,7 +531,8 @@ static inline int fscrypt_require_key(struct inode *inode) } /** - * fscrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory + * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted + * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename @@ -557,7 +559,8 @@ static inline int fscrypt_prepare_link(struct dentry *old_dentry, } /** - * fscrypt_prepare_rename - prepare for a rename between possibly-encrypted directories + * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted + * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory @@ -590,7 +593,8 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, } /** - * fscrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory + * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted + * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory @@ -623,7 +627,8 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, } /** - * fscrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes + * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's + * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * @@ -648,7 +653,7 @@ static inline int fscrypt_prepare_setattr(struct dentry *dentry, } /** - * fscrypt_prepare_symlink - prepare to create a possibly-encrypted symlink + * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink * @dir: directory in which the symlink is being created * @target: plaintext symlink target * @len: length of @target excluding null terminator @@ -687,7 +692,7 @@ static inline int fscrypt_prepare_symlink(struct inode *dir, } /** - * fscrypt_encrypt_symlink - encrypt the symlink target if needed + * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator From fe015a78e5d0139cb126e8dbfc46a80be2bd27ad Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:13:57 -0700 Subject: [PATCH 09/43] fscrypt: name all function parameters Name all the function parameters. This makes it so that we don't have a mix of both styles, so it won't be ambiguous what to use in new fscrypt patches. This also matches the checkpatch expectation. Link: https://lore.kernel.org/r/20200511191358.53096-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/fscrypt.h | 46 ++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index cb2c41f8dfde..210a05dd9ecd 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -56,10 +56,11 @@ struct fscrypt_name { struct fscrypt_operations { unsigned int flags; const char *key_prefix; - int (*get_context)(struct inode *, void *, size_t); - int (*set_context)(struct inode *, const void *, size_t, void *); - bool (*dummy_context)(struct inode *); - bool (*empty_dir)(struct inode *); + int (*get_context)(struct inode *inode, void *ctx, size_t len); + int (*set_context)(struct inode *inode, const void *ctx, size_t len, + void *fs_data); + bool (*dummy_context)(struct inode *inode); + bool (*empty_dir)(struct inode *inode); unsigned int max_namelen; bool (*has_stable_inodes)(struct super_block *sb); void (*get_ino_and_lblk_bits)(struct super_block *sb, @@ -137,13 +138,15 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) extern void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ -extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); -extern int fscrypt_ioctl_get_policy(struct file *, void __user *); -extern int fscrypt_ioctl_get_policy_ex(struct file *, void __user *); +extern int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); +extern int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); +extern int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); extern int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); -extern int fscrypt_has_permitted_context(struct inode *, struct inode *); -extern int fscrypt_inherit_context(struct inode *, struct inode *, - void *, bool); +extern int fscrypt_has_permitted_context(struct inode *parent, + struct inode *child); +extern int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload); + /* keyring.c */ extern void fscrypt_sb_free(struct super_block *sb); extern int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); @@ -153,23 +156,24 @@ extern int fscrypt_ioctl_remove_key_all_users(struct file *filp, extern int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ -extern int fscrypt_get_encryption_info(struct inode *); -extern void fscrypt_put_encryption_info(struct inode *); -extern void fscrypt_free_inode(struct inode *); +extern int fscrypt_get_encryption_info(struct inode *inode); +extern void fscrypt_put_encryption_info(struct inode *inode); +extern void fscrypt_free_inode(struct inode *inode); extern int fscrypt_drop_inode(struct inode *inode); /* fname.c */ -extern int fscrypt_setup_filename(struct inode *, const struct qstr *, - int lookup, struct fscrypt_name *); +extern int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, + int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } -extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, - struct fscrypt_str *); -extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_alloc_buffer(const struct inode *inode, + u32 max_encrypted_len, + struct fscrypt_str *crypto_str); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); extern int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, @@ -180,9 +184,9 @@ extern u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ -extern void fscrypt_decrypt_bio(struct bio *); -extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, - unsigned int); +extern void fscrypt_decrypt_bio(struct bio *bio); +extern int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len); /* hooks.c */ extern int fscrypt_file_open(struct inode *inode, struct file *filp); From 607009020a5e7fd9353fb2dd4cdcc73e26f3350f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 May 2020 12:13:58 -0700 Subject: [PATCH 10/43] fscrypt: remove unnecessary extern keywords Remove the unnecessary 'extern' keywords from function declarations. This makes it so that we don't have a mix of both styles, so it won't be ambiguous what to use in new fscrypt patches. This also makes the code shorter and matches the 'checkpatch --strict' expectation. Link: https://lore.kernel.org/r/20200511191358.53096-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 82 +++++++++++----------- include/linux/fscrypt.h | 134 +++++++++++++++++------------------- 2 files changed, 102 insertions(+), 114 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index f547094100be..20cbd9a4b28b 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -231,15 +231,14 @@ typedef enum { /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; -extern int fscrypt_initialize(unsigned int cop_flags); -extern int fscrypt_crypt_block(const struct inode *inode, - fscrypt_direction_t rw, u64 lblk_num, - struct page *src_page, struct page *dest_page, - unsigned int len, unsigned int offs, - gfp_t gfp_flags); -extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); +int fscrypt_initialize(unsigned int cop_flags); +int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags); +struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); -extern void __printf(3, 4) __cold +void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ @@ -264,12 +263,10 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -extern int fscrypt_fname_encrypt(const struct inode *inode, - const struct qstr *iname, - u8 *out, unsigned int olen); -extern bool fscrypt_fname_encrypted_size(const struct inode *inode, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -278,8 +275,8 @@ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; -extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size); +int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as @@ -294,11 +291,11 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 #define HKDF_CONTEXT_DIRHASH_KEY 5 -extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); +int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); -extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* keyring.c */ @@ -436,14 +433,14 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -extern struct key * +struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -extern int fscrypt_verify_key_added(struct super_block *sb, - const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); -extern int __init fscrypt_init_keyring(void); +int __init fscrypt_init_keyring(void); /* keysetup.c */ @@ -457,33 +454,32 @@ struct fscrypt_mode { extern struct fscrypt_mode fscrypt_modes[]; -extern struct crypto_skcipher * -fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, - const struct inode *inode); +struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode); -extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, - const u8 *raw_key); +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); -extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, - const struct fscrypt_master_key *mk); +int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); /* keysetup_v1.c */ -extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); +void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); -extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, - const u8 *raw_master_key); +int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, + const u8 *raw_master_key); + +int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); -extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings( - struct fscrypt_info *ci); /* policy.c */ -extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1, - const union fscrypt_policy *policy2); -extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, - const struct inode *inode); -extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u, - const union fscrypt_context *ctx_u, - int ctx_size); +bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2); +bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode); +int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 210a05dd9ecd..0e0c7ad19383 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -108,22 +108,21 @@ static inline void fscrypt_handle_d_move(struct dentry *dentry) } /* crypto.c */ -extern void fscrypt_enqueue_decrypt_work(struct work_struct *); +void fscrypt_enqueue_decrypt_work(struct work_struct *); -extern struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags); -extern int fscrypt_encrypt_block_inplace(const struct inode *inode, - struct page *page, unsigned int len, - unsigned int offs, u64 lblk_num, - gfp_t gfp_flags); +struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, + unsigned int len, + unsigned int offs, + gfp_t gfp_flags); +int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num, gfp_t gfp_flags); -extern int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, - unsigned int offs); -extern int fscrypt_decrypt_block_inplace(const struct inode *inode, - struct page *page, unsigned int len, - unsigned int offs, u64 lblk_num); +int fscrypt_decrypt_pagecache_blocks(struct page *page, unsigned int len, + unsigned int offs); +int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { @@ -135,81 +134,74 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) return (struct page *)page_private(bounce_page); } -extern void fscrypt_free_bounce_page(struct page *bounce_page); +void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ -extern int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); -extern int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); -extern int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); -extern int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); -extern int fscrypt_has_permitted_context(struct inode *parent, - struct inode *child); -extern int fscrypt_inherit_context(struct inode *parent, struct inode *child, - void *fs_data, bool preload); +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); +int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); +int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); +int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); +int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload); /* keyring.c */ -extern void fscrypt_sb_free(struct super_block *sb); -extern int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); -extern int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); -extern int fscrypt_ioctl_remove_key_all_users(struct file *filp, - void __user *arg); -extern int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); +void fscrypt_sb_free(struct super_block *sb); +int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); +int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); +int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); +int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ -extern int fscrypt_get_encryption_info(struct inode *inode); -extern void fscrypt_put_encryption_info(struct inode *inode); -extern void fscrypt_free_inode(struct inode *inode); -extern int fscrypt_drop_inode(struct inode *inode); +int fscrypt_get_encryption_info(struct inode *inode); +void fscrypt_put_encryption_info(struct inode *inode); +void fscrypt_free_inode(struct inode *inode); +int fscrypt_drop_inode(struct inode *inode); /* fname.c */ -extern int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, - int lookup, struct fscrypt_name *fname); +int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, + int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } -extern int fscrypt_fname_alloc_buffer(const struct inode *inode, - u32 max_encrypted_len, - struct fscrypt_str *crypto_str); -extern void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); -extern int fscrypt_fname_disk_to_usr(const struct inode *inode, - u32 hash, u32 minor_hash, - const struct fscrypt_str *iname, - struct fscrypt_str *oname); -extern bool fscrypt_match_name(const struct fscrypt_name *fname, - const u8 *de_name, u32 de_name_len); -extern u64 fscrypt_fname_siphash(const struct inode *dir, - const struct qstr *name); +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 max_encrypted_len, + struct fscrypt_str *crypto_str); +void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); +int fscrypt_fname_disk_to_usr(const struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname); +bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len); +u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ -extern void fscrypt_decrypt_bio(struct bio *bio); -extern int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len); +void fscrypt_decrypt_bio(struct bio *bio); +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len); /* hooks.c */ -extern int fscrypt_file_open(struct inode *inode, struct file *filp); -extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, - struct dentry *dentry); -extern int __fscrypt_prepare_rename(struct inode *old_dir, - struct dentry *old_dentry, - struct inode *new_dir, - struct dentry *new_dentry, - unsigned int flags); -extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, - struct fscrypt_name *fname); -extern int fscrypt_prepare_setflags(struct inode *inode, - unsigned int oldflags, unsigned int flags); -extern int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, - unsigned int max_len, - struct fscrypt_str *disk_link); -extern int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, - unsigned int len, - struct fscrypt_str *disk_link); -extern const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, - unsigned int max_size, - struct delayed_call *done); +int fscrypt_file_open(struct inode *inode, struct file *filp); +int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, + struct dentry *dentry); +int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); +int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct fscrypt_name *fname); +int fscrypt_prepare_setflags(struct inode *inode, + unsigned int oldflags, unsigned int flags); +int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, + unsigned int max_len, + struct fscrypt_str *disk_link); +int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, + unsigned int len, struct fscrypt_str *disk_link); +const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, + unsigned int max_size, + struct delayed_call *done); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { From 8b85996095049a2216c0a6b582330ec75913243c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 May 2020 16:32:48 -0700 Subject: [PATCH 11/43] linux/parser.h: add include guards is missing include guards. Add them. This is needed to allow declaring a function in that takes a substring_t parameter. Link: https://lore.kernel.org/r/20200512233251.118314-2-ebiggers@kernel.org Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- include/linux/parser.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/parser.h b/include/linux/parser.h index 12fc3482f5fc..89e2b23fb888 100644 --- a/include/linux/parser.h +++ b/include/linux/parser.h @@ -7,7 +7,8 @@ * but could potentially be used anywhere else that simple option=arg * parsing is required. */ - +#ifndef _LINUX_PARSER_H +#define _LINUX_PARSER_H /* associates an integer enumerator with a pattern string. */ struct match_token { @@ -34,3 +35,5 @@ int match_hex(substring_t *, int *result); bool match_wildcard(const char *pattern, const char *str); size_t match_strlcpy(char *, const substring_t *, size_t); char *match_strdup(const substring_t *); + +#endif /* _LINUX_PARSER_H */ From cdeb21da1783afe26a827eb62d61084f93720be9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 May 2020 16:32:49 -0700 Subject: [PATCH 12/43] fscrypt: add fscrypt_add_test_dummy_key() Currently, the test_dummy_encryption mount option (which is used for encryption I/O testing with xfstests) uses v1 encryption policies, and it relies on userspace inserting a test key into the session keyring. We need test_dummy_encryption to support v2 encryption policies too. Requiring userspace to add the test key doesn't work well with v2 policies, since v2 policies only support the filesystem keyring (not the session keyring), and keys in the filesystem keyring are lost when the filesystem is unmounted. Hooking all test code that unmounts and re-mounts the filesystem would be difficult. Instead, let's make the filesystem automatically add the test key to its keyring when test_dummy_encryption is enabled. That puts the responsibility for choosing the test key on the kernel. We could just hard-code a key. But out of paranoia, let's first try using a per-boot random key, to prevent this code from being misused. A per-boot key will work as long as no one expects dummy-encrypted files to remain accessible after a reboot. (gce-xfstests doesn't.) Therefore, this patch adds a function fscrypt_add_test_dummy_key() which implements the above. The next patch will use it. Link: https://lore.kernel.org/r/20200512233251.118314-3-ebiggers@kernel.org Reviewed-by: Theodore Ts'o Reviewed-by: Jaegeuk Kim Signed-off-by: Eric Biggers --- fs/crypto/fscrypt_private.h | 3 + fs/crypto/keyring.c | 119 +++++++++++++++++++++++------------- 2 files changed, 78 insertions(+), 44 deletions(-) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 20cbd9a4b28b..855ea935f5a6 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -437,6 +437,9 @@ struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec); + int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index ab41b25d4fa1..c983ddfde8ad 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -20,6 +20,7 @@ #include #include +#include #include #include "fscrypt_private.h" @@ -424,9 +425,9 @@ static int add_existing_master_key(struct fscrypt_master_key *mk, return 0; } -static int add_master_key(struct super_block *sb, - struct fscrypt_master_key_secret *secret, - const struct fscrypt_key_specifier *mk_spec) +static int do_add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct key *key; @@ -465,6 +466,35 @@ out_unlock: return err; } +static int add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + struct fscrypt_key_specifier *key_spec) +{ + int err; + + if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = fscrypt_init_hkdf(&secret->hkdf, secret->raw, + secret->size); + if (err) + return err; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret->raw, secret->size); + + /* Calculate the key identifier */ + err = fscrypt_hkdf_expand(&secret->hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0, + key_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + return err; + } + return do_add_master_key(sb, secret, key_spec); +} + static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; @@ -609,6 +639,15 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; + /* + * Only root can add keys that are identified by an arbitrary descriptor + * rather than by a cryptographic hash --- since otherwise a malicious + * user could add the wrong key. + */ + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + memset(&secret, 0, sizeof(secret)); if (arg.key_id) { if (arg.raw_size != 0) @@ -626,54 +665,46 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) goto out_wipe_secret; } - switch (arg.key_spec.type) { - case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: - /* - * Only root can add keys that are identified by an arbitrary - * descriptor rather than by a cryptographic hash --- since - * otherwise a malicious user could add the wrong key. - */ - err = -EACCES; - if (!capable(CAP_SYS_ADMIN)) - goto out_wipe_secret; - break; - case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: - err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); - if (err) - goto out_wipe_secret; - - /* - * Now that the HKDF context is initialized, the raw key is no - * longer needed. - */ - memzero_explicit(secret.raw, secret.size); - - /* Calculate the key identifier and return it to userspace. */ - err = fscrypt_hkdf_expand(&secret.hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER, - NULL, 0, arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - if (err) - goto out_wipe_secret; - err = -EFAULT; - if (copy_to_user(uarg->key_spec.u.identifier, - arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE)) - goto out_wipe_secret; - break; - default: - WARN_ON(1); - err = -EINVAL; - goto out_wipe_secret; - } - err = add_master_key(sb, &secret, &arg.key_spec); + if (err) + goto out_wipe_secret; + + /* Return the key identifier to userspace, if applicable */ + err = -EFAULT; + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && + copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + err = 0; out_wipe_secret: wipe_master_key_secret(&secret); return err; } EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); +/* + * Add the key for '-o test_dummy_encryption' to the filesystem keyring. + * + * Use a per-boot random key to prevent people from misusing this option. + */ +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec) +{ + static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; + struct fscrypt_master_key_secret secret; + int err; + + get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + + memset(&secret, 0, sizeof(secret)); + secret.size = FSCRYPT_MAX_KEY_SIZE; + memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE); + + err = add_master_key(sb, &secret, key_spec); + wipe_master_key_secret(&secret); + return err; +} + /* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting From ed318a6cc0b620440e65f48eb527dc3df7269ce4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 May 2020 16:32:50 -0700 Subject: [PATCH 13/43] fscrypt: support test_dummy_encryption=v2 v1 encryption policies are deprecated in favor of v2, and some new features (e.g. encryption+casefolding) are only being added for v2. Therefore, the "test_dummy_encryption" mount option (which is used for encryption I/O testing with xfstests) needs to support v2 policies. To do this, extend its syntax to be "test_dummy_encryption=v1" or "test_dummy_encryption=v2". The existing "test_dummy_encryption" (no argument) also continues to be accepted, to specify the default setting -- currently v1, but the next patch changes it to v2. To cleanly support both v1 and v2 while also making it easy to support specifying other encryption settings in the future (say, accepting "$contents_mode:$filenames_mode:v2"), make ext4 and f2fs maintain a pointer to the dummy fscrypt_context rather than using mount flags. To avoid concurrency issues, don't allow test_dummy_encryption to be set or changed during a remount. (The former restriction is new, but xfstests doesn't run into it, so no one should notice.) Tested with 'gce-xfstests -c {ext4,f2fs}/encrypt -g auto'. On ext4, there are two regressions, both of which are test bugs: ext4/023 and ext4/028 fail because they set an xattr and expect it to be stored inline, but the increase in size of the fscrypt_context from 24 to 40 bytes causes this xattr to be spilled into an external block. Link: https://lore.kernel.org/r/20200512233251.118314-4-ebiggers@kernel.org Acked-by: Jaegeuk Kim Reviewed-by: Theodore Ts'o Signed-off-by: Eric Biggers --- Documentation/filesystems/f2fs.rst | 6 +- fs/crypto/keysetup.c | 15 ++-- fs/crypto/policy.c | 125 +++++++++++++++++++++++++++++ fs/ext4/ext4.h | 7 +- fs/ext4/super.c | 68 ++++++++++++---- fs/ext4/sysfs.c | 2 + fs/f2fs/f2fs.h | 4 +- fs/f2fs/super.c | 85 ++++++++++++++------ fs/f2fs/sysfs.c | 4 + include/linux/fscrypt.h | 51 ++++++++++-- 10 files changed, 307 insertions(+), 60 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 87d794bc75a4..4218ac658629 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -225,8 +225,12 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix", pass, but the performance will regress. "nobarrier" is based on "posix", but doesn't issue flush command for non-atomic files likewise "nobarrier" mount option. -test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt +test_dummy_encryption +test_dummy_encryption=%s + Enable dummy encryption, which provides a fake fscrypt context. The fake fscrypt context is used by xfstests. + The argument may be either "v1" or "v2", in order to + select the corresponding fscrypt policy version. checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable" to reenable checkpointing. Is enabled by default. While disabled, any unmounting or unexpected shutdowns will cause diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index edc4590c69f0..675479f8e6f3 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -395,21 +395,18 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode) || - IS_ENCRYPTED(inode)) { + const union fscrypt_context *dummy_ctx = + fscrypt_get_dummy_context(inode->i_sb); + + if (IS_ENCRYPTED(inode) || !dummy_ctx) { fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } /* Fake up a context for an unencrypted directory */ - memset(&ctx, 0, sizeof(ctx)); - ctx.version = FSCRYPT_CONTEXT_V1; - ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memset(ctx.v1.master_key_descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - res = sizeof(ctx.v1); + res = fscrypt_context_size(dummy_ctx); + memcpy(&ctx, dummy_ctx, res); } crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 0a95537fcef9..bcab94b29d10 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include "fscrypt_private.h" @@ -616,3 +617,127 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return preload ? fscrypt_get_encryption_info(child): 0; } EXPORT_SYMBOL(fscrypt_inherit_context); + +/** + * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' + * @sb: the filesystem on which test_dummy_encryption is being specified + * @arg: the argument to the test_dummy_encryption option. + * If no argument was specified, then @arg->from == NULL. + * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * + * Handle the test_dummy_encryption mount option by creating a dummy encryption + * context, saving it in @dummy_ctx, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_ctx is already set, then + * instead validate that it matches @arg. Don't support changing it via + * remount, as that is difficult to do safely. + * + * The reason we use an fscrypt_context rather than an fscrypt_policy is because + * we mustn't generate a new nonce each time we access a dummy-encrypted + * directory, as that would change the way filenames are encrypted. + * + * Return: 0 on success (dummy context set, or the same context is already set); + * -EEXIST if a different dummy context is already set; + * or another -errno value. + */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, + const substring_t *arg, + struct fscrypt_dummy_context *dummy_ctx) +{ + const char *argstr = "v1"; + const char *argstr_to_free = NULL; + struct fscrypt_key_specifier key_spec = { 0 }; + int version; + union fscrypt_context *ctx = NULL; + int err; + + if (arg->from) { + argstr = argstr_to_free = match_strdup(arg); + if (!argstr) + return -ENOMEM; + } + + if (!strcmp(argstr, "v1")) { + version = FSCRYPT_CONTEXT_V1; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memset(key_spec.u.descriptor, 0x42, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + } else if (!strcmp(argstr, "v2")) { + version = FSCRYPT_CONTEXT_V2; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + /* key_spec.u.identifier gets filled in when adding the key */ + } else { + err = -EINVAL; + goto out; + } + + if (dummy_ctx->ctx) { + /* + * Note: if we ever make test_dummy_encryption support + * specifying other encryption settings, such as the encryption + * modes, we'll need to compare those settings here. + */ + if (dummy_ctx->ctx->version == version) + err = 0; + else + err = -EEXIST; + goto out; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + err = fscrypt_add_test_dummy_key(sb, &key_spec); + if (err) + goto out; + + ctx->version = version; + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case FSCRYPT_CONTEXT_V2: + ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out; + } + dummy_ctx->ctx = ctx; + ctx = NULL; + err = 0; +out: + kfree(ctx); + kfree(argstr_to_free); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); + +/** + * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' + * @seq: the seq_file to print the option to + * @sep: the separator character to use + * @sb: the filesystem whose options are being shown + * + * Show the test_dummy_encryption mount option, if it was specified. + * This is mainly used for /proc/mounts. + */ +void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, + struct super_block *sb) +{ + const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + + if (!ctx) + return; + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); +} +EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..546504cba842 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1357,11 +1357,9 @@ struct ext4_super_block { */ #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ -#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ - EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1551,6 +1549,9 @@ struct ext4_sb_info { struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + /* Encryption context for '-o test_dummy_encryption' */ + struct fscrypt_dummy_context s_dummy_enc_ctx; + /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..4a3d21972011 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1106,6 +1106,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1389,9 +1390,10 @@ retry: return res; } -static bool ext4_dummy_context(struct inode *inode) +static const union fscrypt_context * +ext4_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); + return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1410,7 +1412,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .dummy_context = ext4_dummy_context, + .get_dummy_context = ext4_get_dummy_context, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, @@ -1605,6 +1607,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ @@ -1816,7 +1819,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, - {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -1851,6 +1854,48 @@ static int ext4_sb_read_encoding(const struct ext4_super_block *es, } #endif +static int ext4_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ +#ifdef CONFIG_FS_ENCRYPTION + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + ext4_msg(sb, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -1; + } + err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + ext4_msg(sb, KERN_WARNING, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + ext4_msg(sb, KERN_WARNING, + "Value of option \"%s\" is unrecognized", opt); + else + ext4_msg(sb, KERN_WARNING, + "Error processing option \"%s\" [%d]", + opt, err); + return -1; + } + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif + return 1; +} + static int handle_mount_opt(struct super_block *sb, char *opt, int token, substring_t *args, unsigned long *journal_devnum, unsigned int *journal_ioprio, int is_remount) @@ -2047,14 +2092,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { -#ifdef CONFIG_FS_ENCRYPTION - sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mode enabled"); -#else - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); -#endif + return ext4_set_test_dummy_encryption(sb, opt, &args[0], + is_remount); } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2311,8 +2350,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); - if (DUMMY_ENCRYPTION_ENABLED(sbi)) - SEQ_OPTS_PUTS("test_dummy_encryption"); + + fscrypt_show_test_dummy_encryption(seq, sep, sb); ext4_show_quota_options(seq, sb); return 0; @@ -4780,6 +4819,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); ext4_blkdev_remove(sbi); brelse(bh); out_fail: diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 04bfaf63752c..6c9fc9e21c13 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -293,6 +293,7 @@ EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); #ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); @@ -308,6 +309,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(meta_bg_resize), #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_UNICODE ATTR_LIST(casefold), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ba470d5687fe..157eec348970 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,7 +138,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ - bool test_dummy_encryption; /* test dummy encryption */ + struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ @@ -1259,7 +1259,7 @@ enum fsync_mode { #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) + (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2dfc21c6abb..8a9955902d84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -202,6 +202,7 @@ static match_table_t f2fs_tokens = { {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, @@ -394,7 +395,52 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int parse_options(struct super_block *sb, char *options) +static int f2fs_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); +#ifdef CONFIG_FS_ENCRYPTION + int err; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + err = fscrypt_set_test_dummy_encryption( + sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + f2fs_warn(sbi, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", + opt); + else + f2fs_warn(sbi, "Error processing option \"%s\" [%d]", + opt, err); + return -EINVAL; + } + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +#else + f2fs_warn(sbi, "Test dummy encryption mount option ignored"); +#endif + return 0; +} + +static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; @@ -403,9 +449,7 @@ static int parse_options(struct super_block *sb, char *options) int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; -#ifdef CONFIG_QUOTA int ret; -#endif if (!options) return 0; @@ -778,17 +822,10 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - F2FS_OPTION(sbi).test_dummy_encryption = true; - f2fs_info(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_info(sbi, "Test dummy encryption mount option ignored"); -#endif + ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + is_remount); + if (ret) + return ret; break; case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) @@ -1213,6 +1250,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -1543,10 +1581,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_FS_ENCRYPTION - if (F2FS_OPTION(sbi).test_dummy_encryption) - seq_puts(seq, ",test_dummy_encryption"); -#endif + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1575,7 +1611,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; @@ -1734,7 +1769,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi); /* parse mount options */ - err = parse_options(sb, data); + err = parse_options(sb, data, true); if (err) goto restore_opts; checkpoint_changed = @@ -2410,9 +2445,10 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static bool f2fs_dummy_context(struct inode *inode) +static const union fscrypt_context * +f2fs_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2431,7 +2467,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .dummy_context = f2fs_dummy_context, + .get_dummy_context = f2fs_get_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -3366,7 +3402,7 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options); + err = parse_options(sb, options, false); if (err) goto free_options; @@ -3769,6 +3805,7 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); kvfree(options); free_sb_buf: kvfree(raw_super); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e3bbbef9b4f0..3162f46b3c9b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -446,6 +446,7 @@ enum feat_id { FEAT_SB_CHECKSUM, FEAT_CASEFOLD, FEAT_COMPRESSION, + FEAT_TEST_DUMMY_ENCRYPTION_V2, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -466,6 +467,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: case FEAT_COMPRESSION: + case FEAT_TEST_DUMMY_ENCRYPTION_V2: return sprintf(buf, "supported\n"); } return 0; @@ -563,6 +565,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); #endif #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); @@ -647,6 +650,7 @@ ATTRIBUTE_GROUPS(f2fs); static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 0e0c7ad19383..2862ca5fea33 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -15,12 +15,15 @@ #include #include +#include #include #include #define FS_CRYPTO_BLOCK_SIZE 16 +union fscrypt_context; struct fscrypt_info; +struct seq_file; struct fscrypt_str { unsigned char *name; @@ -59,7 +62,8 @@ struct fscrypt_operations { int (*get_context)(struct inode *inode, void *ctx, size_t len); int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); - bool (*dummy_context)(struct inode *inode); + const union fscrypt_context *(*get_dummy_context)( + struct super_block *sb); bool (*empty_dir)(struct inode *inode); unsigned int max_namelen; bool (*has_stable_inodes)(struct super_block *sb); @@ -89,10 +93,12 @@ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +static inline const union fscrypt_context * +fscrypt_get_dummy_context(struct super_block *sb) { - return inode->i_sb->s_cop->dummy_context && - inode->i_sb->s_cop->dummy_context(inode); + if (!sb->s_cop->get_dummy_context) + return NULL; + return sb->s_cop->get_dummy_context(sb); } /* @@ -145,6 +151,22 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload); +struct fscrypt_dummy_context { + const union fscrypt_context *ctx; +}; + +int fscrypt_set_test_dummy_encryption(struct super_block *sb, + const substring_t *arg, + struct fscrypt_dummy_context *dummy_ctx); +void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, + struct super_block *sb); +static inline void +fscrypt_free_dummy_context(struct fscrypt_dummy_context *dummy_ctx) +{ + kfree(dummy_ctx->ctx); + dummy_ctx->ctx = NULL; +} + /* keyring.c */ void fscrypt_sb_free(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); @@ -219,9 +241,10 @@ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) return false; } -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +static inline const union fscrypt_context * +fscrypt_get_dummy_context(struct super_block *sb) { - return false; + return NULL; } static inline void fscrypt_handle_d_move(struct dentry *dentry) @@ -316,6 +339,20 @@ static inline int fscrypt_inherit_context(struct inode *parent, return -EOPNOTSUPP; } +struct fscrypt_dummy_context { +}; + +static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, + char sep, + struct super_block *sb) +{ +} + +static inline void +fscrypt_free_dummy_context(struct fscrypt_dummy_context *dummy_ctx) +{ +} + /* keyring.c */ static inline void fscrypt_sb_free(struct super_block *sb) { @@ -677,7 +714,7 @@ static inline int fscrypt_prepare_symlink(struct inode *dir, unsigned int max_len, struct fscrypt_str *disk_link) { - if (IS_ENCRYPTED(dir) || fscrypt_dummy_context_enabled(dir)) + if (IS_ENCRYPTED(dir) || fscrypt_get_dummy_context(dir->i_sb) != NULL) return __fscrypt_prepare_symlink(dir, len, max_len, disk_link); disk_link->name = (unsigned char *)target; From 0ca2ddb0cd3c587ca50a29af7969bbfecbc3d663 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 12 May 2020 16:32:51 -0700 Subject: [PATCH 14/43] fscrypt: make test_dummy_encryption use v2 by default Since v1 encryption policies are deprecated, make test_dummy_encryption test v2 policies by default. Note that this causes ext4/023 and ext4/028 to start failing due to known bugs in those tests (see previous commit). Link: https://lore.kernel.org/r/20200512233251.118314-5-ebiggers@kernel.org Reviewed-by: Jaegeuk Kim Reviewed-by: Theodore Ts'o Signed-off-by: Eric Biggers --- fs/crypto/policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index bcab94b29d10..a15aec8e068c 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -643,7 +643,7 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb, const substring_t *arg, struct fscrypt_dummy_context *dummy_ctx) { - const char *argstr = "v1"; + const char *argstr = "v2"; const char *argstr_to_free = NULL; struct fscrypt_key_specifier key_spec = { 0 }; int version; From e3b1078bedd323df343894a27eb3b3c34944dfd1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 May 2020 13:41:41 -0700 Subject: [PATCH 15/43] fscrypt: add support for IV_INO_LBLK_32 policies The eMMC inline crypto standard will only specify 32 DUN bits (a.k.a. IV bits), unlike UFS's 64. IV_INO_LBLK_64 is therefore not applicable, but an encryption format which uses one key per policy and permits the moving of encrypted file contents (as f2fs's garbage collector requires) is still desirable. To support such hardware, add a new encryption format IV_INO_LBLK_32 that makes the best use of the 32 bits: the IV is set to 'SipHash-2-4(inode_number) + file_logical_block_number mod 2^32', where the SipHash key is derived from the fscrypt master key. We hash only the inode number and not also the block number, because we need to maintain contiguity of DUNs to merge bios. Unlike with IV_INO_LBLK_64, with this format IV reuse is possible; this is unavoidable given the size of the DUN. This means this format should only be used where the requirements of the first paragraph apply. However, the hash spreads out the IVs in the whole usable range, and the use of a keyed hash makes it difficult for an attacker to determine which files use which IVs. Besides the above differences, this flag works like IV_INO_LBLK_64 in that on ext4 it is only allowed if the stable_inodes feature has been enabled to prevent inode numbers and the filesystem UUID from changing. Link: https://lore.kernel.org/r/20200515204141.251098-1-ebiggers@kernel.org Reviewed-by: Theodore Ts'o Reviewed-by: Paul Crowley Signed-off-by: Eric Biggers --- Documentation/filesystems/fscrypt.rst | 33 +++++++++-- fs/crypto/crypto.c | 6 +- fs/crypto/fscrypt_private.h | 20 +++++-- fs/crypto/keyring.c | 5 +- fs/crypto/keysetup.c | 85 +++++++++++++++++++++------ fs/crypto/policy.c | 51 +++++++++++----- include/uapi/linux/fscrypt.h | 3 +- 7 files changed, 157 insertions(+), 46 deletions(-) diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index aa072112cfff..f517af8ec11c 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -292,8 +292,22 @@ files' data differently, inode numbers are included in the IVs. Consequently, shrinking the filesystem may not be allowed. This format is optimized for use with inline encryption hardware -compliant with the UFS or eMMC standards, which support only 64 IV -bits per I/O request and may have only a small number of keyslots. +compliant with the UFS standard, which supports only 64 IV bits per +I/O request and may have only a small number of keyslots. + +IV_INO_LBLK_32 policies +----------------------- + +IV_INO_LBLK_32 policies work like IV_INO_LBLK_64, except that for +IV_INO_LBLK_32, the inode number is hashed with SipHash-2-4 (where the +SipHash key is derived from the master key) and added to the file +logical block number mod 2^32 to produce a 32-bit IV. + +This format is optimized for use with inline encryption hardware +compliant with the eMMC v5.2 standard, which supports only 32 IV bits +per I/O request and may have only a small number of keyslots. This +format results in some level of IV reuse, so it should only be used +when necessary due to hardware limitations. Key identifiers --------------- @@ -369,6 +383,10 @@ a little endian number, except that: to 32 bits and is placed in bits 0-31 of the IV. The inode number (which is also limited to 32 bits) is placed in bits 32-63. +- With `IV_INO_LBLK_32 policies`_, the logical block number is limited + to 32 bits and is placed in bits 0-31 of the IV. The inode number + is then hashed and added mod 2^32. + Note that because file logical block numbers are included in the IVs, filesystems must enforce that blocks are never shifted around within encrypted files, e.g. via "collapse range" or "insert range". @@ -465,8 +483,15 @@ This structure must be initialized as follows: (0x3). - FSCRYPT_POLICY_FLAG_DIRECT_KEY: See `DIRECT_KEY policies`_. - FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64: See `IV_INO_LBLK_64 - policies`_. This is mutually exclusive with DIRECT_KEY and is not - supported on v1 policies. + policies`_. + - FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32: See `IV_INO_LBLK_32 + policies`_. + + v1 encryption policies only support the PAD_* and DIRECT_KEY flags. + The other flags are only supported by v2 encryption policies. + + The DIRECT_KEY, IV_INO_LBLK_64, and IV_INO_LBLK_32 flags are + mutually exclusive. - For v2 encryption policies, ``__reserved`` must be zeroed. diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 40c2821a341e..ed015cb66c7c 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -77,8 +77,12 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, memset(iv, 0, ci->ci_mode->ivsize); if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { - WARN_ON_ONCE((u32)lblk_num != lblk_num); + WARN_ON_ONCE(lblk_num > U32_MAX); + WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX); lblk_num |= (u64)ci->ci_inode->i_ino << 32; + } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + WARN_ON_ONCE(lblk_num > U32_MAX); + lblk_num = (u32)(ci->ci_hashed_ino + lblk_num); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); } diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 855ea935f5a6..eb7fcd2b7fb8 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -222,6 +222,9 @@ struct fscrypt_info { /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + + /* Hashed inode number. Only set for IV_INO_LBLK_32 */ + u32 ci_hashed_ino; }; typedef enum { @@ -290,6 +293,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_DIRECT_KEY 3 #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 #define HKDF_CONTEXT_DIRHASH_KEY 5 +#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 +#define HKDF_CONTEXT_INODE_HASH_KEY 7 int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, @@ -386,14 +391,17 @@ struct fscrypt_master_key { struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; - /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */ - struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1]; - /* - * Crypto API transforms for filesystem-layer implementation of - * IV_INO_LBLK_64 policies, allocated on-demand. + * Per-mode encryption keys for the various types of encryption policies + * that use them. Allocated and derived on-demand. */ - struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; + + /* Hash key for inode numbers. Initialized only when needed. */ + siphash_key_t mk_ino_hash_key; + bool mk_ino_hash_key_initialized; } __randomize_layout; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index c983ddfde8ad..e24eb48bfbe1 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -45,8 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { - crypto_free_skcipher(mk->mk_direct_tfms[i]); - crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]); + crypto_free_skcipher(mk->mk_direct_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]); } key_put(mk->mk_users); diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 675479f8e6f3..1129adfa097d 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -46,6 +46,8 @@ struct fscrypt_mode fscrypt_modes[] = { }, }; +static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); + static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) @@ -130,7 +132,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; - struct crypto_skcipher *tfm, *prev_tfm; + struct crypto_skcipher *tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; @@ -139,10 +141,17 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) return -EINVAL; - /* pairs with cmpxchg() below */ + /* pairs with smp_store_release() below */ tfm = READ_ONCE(tfms[mode_num]); - if (likely(tfm != NULL)) - goto done; + if (likely(tfm != NULL)) { + ci->ci_ctfm = tfm; + return 0; + } + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (tfms[mode_num]) + goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); @@ -157,21 +166,21 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) - return err; + goto out_unlock; tfm = fscrypt_allocate_skcipher(mode, mode_key, inode); memzero_explicit(mode_key, mode->keysize); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - /* pairs with READ_ONCE() above */ - prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm); - if (prev_tfm != NULL) { - crypto_free_skcipher(tfm); - tfm = prev_tfm; + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + goto out_unlock; } -done: + /* pairs with READ_ONCE() above */ + smp_store_release(&tfms[mode_num], tfm); +done_unlock: ci->ci_ctfm = tfm; - return 0; + err = 0; +out_unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + return err; } int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, @@ -189,6 +198,43 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + int err; + + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, + HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); + if (err) + return err; + + /* pairs with smp_store_release() below */ + if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (mk->mk_ino_hash_key_initialized) + goto unlock; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, + (u8 *)&mk->mk_ino_hash_key, + sizeof(mk->mk_ino_hash_key)); + if (err) + goto unlock; + /* pairs with smp_load_acquire() above */ + smp_store_release(&mk->mk_ino_hash_key_initialized, true); +unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + if (err) + return err; + } + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); + return 0; +} + static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -203,7 +249,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { @@ -211,11 +257,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline - * encryption hardware compliant with the UFS or eMMC standards. + * encryption hardware compliant with the UFS standard. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); + } else if (ci->ci_policy.v2.flags & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index a15aec8e068c..d23ff162c78b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -69,18 +69,14 @@ static bool supported_direct_key_modes(const struct inode *inode, return true; } -static bool supported_iv_ino_lblk_64_policy( - const struct fscrypt_policy_v2 *policy, - const struct inode *inode) +static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, + const struct inode *inode, + const char *type, + int max_ino_bits, int max_lblk_bits) { struct super_block *sb = inode->i_sb; int ino_bits = 64, lblk_bits = 64; - if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { - fscrypt_warn(inode, - "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive"); - return false; - } /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. @@ -88,16 +84,22 @@ static bool supported_iv_ino_lblk_64_policy( if (!sb->s_cop->has_stable_inodes || !sb->s_cop->has_stable_inodes(sb)) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers", + type, sb->s_id); return false; } if (sb->s_cop->get_ino_and_lblk_bits) sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - if (ino_bits > 32 || lblk_bits > 32) { + if (ino_bits > max_ino_bits) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because its inode numbers are too long", + type, sb->s_id); + return false; + } + if (lblk_bits > max_lblk_bits) { + fscrypt_warn(inode, + "Can't use %s policy on filesystem '%s' because its block numbers are too long", + type, sb->s_id); return false; } return true; @@ -140,6 +142,8 @@ static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { + int count = 0; + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, @@ -155,13 +159,29 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32); + if (count > 1) { + fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)", + policy->flags); + return false; + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_64_policy(policy, inode)) + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", + 32, 32)) + return false; + + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && + /* This uses hashed inode numbers, so ino_bits doesn't matter. */ + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", + INT_MAX, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -366,6 +386,9 @@ static int set_encryption_policy(struct inode *inode, policy->v2.master_key_identifier); if (err) return err; + if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) + pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n", + current->comm, current->pid); break; default: WARN_ON(1); diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index a10e3cdc2839..7875709ccfeb 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -19,7 +19,8 @@ #define FSCRYPT_POLICY_FLAGS_PAD_MASK 0x03 #define FSCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 #define FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 0x08 -#define FSCRYPT_POLICY_FLAGS_VALID 0x0F +#define FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32 0x10 +#define FSCRYPT_POLICY_FLAGS_VALID 0x1F /* Encryption algorithms */ #define FSCRYPT_MODE_AES_256_XTS 1 From 27e5041a87e8af2d0b6452dffe053d0253e914cc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 May 2020 19:43:41 -0700 Subject: [PATCH 16/43] pstore: Add locking around superblock changes Nothing was protecting changes to the pstorefs superblock. Add locking and refactor away is_pstore_mounted(), instead using a helper to add a way to safely lock the pstorefs root inode during filesystem changes. Link: https://lore.kernel.org/lkml/20200506152114.50375-9-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/inode.c | 65 +++++++++++++++++++++++++++++--------------- fs/pstore/internal.h | 1 - fs/pstore/platform.c | 5 ++-- 3 files changed, 45 insertions(+), 26 deletions(-) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5f08b21b7a46..0e76e12fa6d1 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -31,6 +31,9 @@ static DEFINE_MUTEX(records_list_lock); static LIST_HEAD(records_list); +static DEFINE_MUTEX(pstore_sb_lock); +static struct super_block *pstore_sb; + struct pstore_private { struct list_head list; struct pstore_record *record; @@ -282,11 +285,25 @@ static const struct super_operations pstore_ops = { .show_options = pstore_show_options, }; -static struct super_block *pstore_sb; - -bool pstore_is_mounted(void) +static struct dentry *psinfo_lock_root(void) { - return pstore_sb != NULL; + struct dentry *root; + + mutex_lock(&pstore_sb_lock); + /* + * Having no backend is fine -- no records appear. + * Not being mounted is fine -- nothing to do. + */ + if (!psinfo || !pstore_sb) { + mutex_unlock(&pstore_sb_lock); + return NULL; + } + + root = pstore_sb->s_root; + inode_lock(d_inode(root)); + mutex_unlock(&pstore_sb_lock); + + return root; } /* @@ -303,20 +320,18 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) struct pstore_private *private, *pos; size_t size = record->size + record->ecc_notice_size; - WARN_ON(!inode_is_locked(d_inode(root))); + if (WARN_ON(!inode_is_locked(d_inode(root)))) + return -EINVAL; + rc = -EEXIST; + /* Skip records that are already present in the filesystem. */ mutex_lock(&records_list_lock); list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && - pos->record->psi == record->psi) { - rc = -EEXIST; - break; - } + pos->record->psi == record->psi) + goto fail; } - mutex_unlock(&records_list_lock); - if (rc) - return rc; rc = -ENOMEM; inode = pstore_get_inode(root->d_sb); @@ -346,7 +361,6 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) d_add(dentry, inode); - mutex_lock(&records_list_lock); list_add(&private->list, &records_list); mutex_unlock(&records_list_lock); @@ -356,8 +370,8 @@ fail_private: free_pstore_private(private); fail_inode: iput(inode); - fail: + mutex_unlock(&records_list_lock); return rc; } @@ -369,16 +383,13 @@ fail: */ void pstore_get_records(int quiet) { - struct pstore_info *psi = psinfo; struct dentry *root; - if (!psi || !pstore_sb) + root = psinfo_lock_root(); + if (!root) return; - root = pstore_sb->s_root; - - inode_lock(d_inode(root)); - pstore_get_backend_records(psi, root, quiet); + pstore_get_backend_records(psinfo, root, quiet); inode_unlock(d_inode(root)); } @@ -386,8 +397,6 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - pstore_sb = sb; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -408,6 +417,10 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) return -ENOMEM; + mutex_lock(&pstore_sb_lock); + pstore_sb = sb; + mutex_unlock(&pstore_sb_lock); + pstore_get_records(0); return 0; @@ -421,9 +434,17 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type, static void pstore_kill_sb(struct super_block *sb) { + mutex_lock(&pstore_sb_lock); + WARN_ON(pstore_sb != sb); + kill_litter_super(sb); pstore_sb = NULL; + + mutex_lock(&records_list_lock); INIT_LIST_HEAD(&records_list); + mutex_unlock(&records_list_lock); + + mutex_unlock(&pstore_sb_lock); } static struct file_system_type pstore_fs_type = { diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7062ea4bc57c..fe5f7ef7323f 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -33,7 +33,6 @@ extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); extern int pstore_mkfile(struct dentry *root, struct pstore_record *record); -extern bool pstore_is_mounted(void); extern void pstore_record_init(struct pstore_record *record, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 8c0076a1f896..624e100fbeca 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -460,7 +460,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, } ret = psinfo->write(&record); - if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + if (ret == 0 && reason == KMSG_DUMP_OOPS) pstore_new_entry = 1; total += record.size; @@ -592,8 +592,7 @@ int pstore_register(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); - if (pstore_is_mounted()) - pstore_get_records(0); + pstore_get_records(0); if (psi->flags & PSTORE_FLAGS_DMESG) pstore_register_kmsg(); From 78c83c828c043f2e18929137c1e218e8977349b1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 5 May 2020 21:36:15 -0700 Subject: [PATCH 17/43] pstore: Do not leave timer disabled for next backend The pstore.update_ms value was being disabled during pstore_unregister(), which would cause any prior value to go unnoticed on the next pstore_register(). Instead, just let del_timer() stop the timer, which was always sufficient. This additionally refactors the timer reset code and allows the timer to be enabled if the module parameter is changed away from the default. Link: https://lore.kernel.org/lkml/20200506152114.50375-10-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 624e100fbeca..327ee70e881d 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -44,7 +44,7 @@ static int pstore_update_ms = -1; module_param_named(update_ms, pstore_update_ms, int, 0600); MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " "(default is -1, which means runtime updates are disabled; " - "enabling this option is not safe, it may lead to further " + "enabling this option may not be safe; it may lead to further " "corruption on Oopses)"); /* Names should be in the same order as the enum pstore_type_id */ @@ -150,6 +150,14 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) } } +static void pstore_timer_kick(void) +{ + if (pstore_update_ms < 0) + return; + + mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); +} + /* * Should pstore_dump() wait for a concurrent pstore_dump()? If * not, the current pstore_dump() will report a failure to dump @@ -460,8 +468,10 @@ static void pstore_dump(struct kmsg_dumper *dumper, } ret = psinfo->write(&record); - if (ret == 0 && reason == KMSG_DUMP_OOPS) + if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; + pstore_timer_kick(); + } total += record.size; part++; @@ -604,11 +614,7 @@ int pstore_register(struct pstore_info *psi) pstore_register_pmsg(); /* Start watching for new records, if desired. */ - if (pstore_update_ms >= 0) { - pstore_timer.expires = jiffies + - msecs_to_jiffies(pstore_update_ms); - add_timer(&pstore_timer); - } + pstore_timer_kick(); /* * Update the module parameter backend, so it is visible @@ -637,11 +643,7 @@ void pstore_unregister(struct pstore_info *psi) return; } - /* Stop timer and make sure all work has finished. */ - pstore_update_ms = -1; - del_timer_sync(&pstore_timer); - flush_work(&pstore_work); - + /* Unregister all callbacks. */ if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -651,6 +653,10 @@ void pstore_unregister(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_DMESG) pstore_unregister_kmsg(); + /* Stop timer and make sure all work has finished. */ + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + free_buf_for_compression(); psinfo = NULL; @@ -792,9 +798,7 @@ static void pstore_timefunc(struct timer_list *unused) schedule_work(&pstore_work); } - if (pstore_update_ms >= 0) - mod_timer(&pstore_timer, - jiffies + msecs_to_jiffies(pstore_update_ms)); + pstore_timer_kick(); } static void __init pstore_choose_compression(void) From 609e28bb139e53621521130f0d4aea27a725d465 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 May 2020 19:46:53 -0700 Subject: [PATCH 18/43] pstore: Remove filesystem records when backend is unregistered If a backend was unloaded without having first removed all its associated records in pstorefs, subsequent removals would crash while attempting to call into the now missing backend. Add automatic removal from the tree in pstore_unregister(), so that no references to the backend remain. Reported-by: Luis Henriques Link: https://lore.kernel.org/lkml/87o8yrmv69.fsf@suse.com Link: https://lore.kernel.org/lkml/20200506152114.50375-11-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/inode.c | 32 ++++++++++++++++++++++++++++++++ fs/pstore/internal.h | 1 + fs/pstore/platform.c | 3 +++ 3 files changed, 36 insertions(+) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 0e76e12fa6d1..c331efe8de95 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -36,6 +36,7 @@ static struct super_block *pstore_sb; struct pstore_private { struct list_head list; + struct dentry *dentry; struct pstore_record *record; size_t total_size; }; @@ -191,6 +192,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) list_del_init(&p->list); else rc = -ENOENT; + p->dentry = NULL; mutex_unlock(&records_list_lock); if (rc) return rc; @@ -306,6 +308,35 @@ static struct dentry *psinfo_lock_root(void) return root; } +int pstore_put_backend_records(struct pstore_info *psi) +{ + struct pstore_private *pos, *tmp; + struct dentry *root; + int rc = 0; + + root = psinfo_lock_root(); + if (!root) + return 0; + + mutex_lock(&records_list_lock); + list_for_each_entry_safe(pos, tmp, &records_list, list) { + if (pos->record->psi == psi) { + list_del_init(&pos->list); + rc = simple_unlink(d_inode(root), pos->dentry); + if (WARN_ON(rc)) + break; + d_drop(pos->dentry); + dput(pos->dentry); + pos->dentry = NULL; + } + } + mutex_unlock(&records_list_lock); + + inode_unlock(d_inode(root)); + + return rc; +} + /* * Make a regular file in the root directory of our file system. * Load it up with "size" bytes of data from "buf". @@ -352,6 +383,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) if (!dentry) goto fail_private; + private->dentry = dentry; private->record = record; inode->i_size = private->total_size = size; inode->i_private = private; diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index fe5f7ef7323f..8efd72d93b10 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -31,6 +31,7 @@ extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); +extern int pstore_put_backend_records(struct pstore_info *psi); extern int pstore_mkfile(struct dentry *root, struct pstore_record *record); extern void pstore_record_init(struct pstore_record *record, diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 327ee70e881d..398785ab059f 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -657,6 +657,9 @@ void pstore_unregister(struct pstore_info *psi) del_timer_sync(&pstore_timer); flush_work(&pstore_work); + /* Remove all backend records from filesystem tree. */ + pstore_put_backend_records(psi); + free_buf_for_compression(); psinfo = NULL; From b7753fc7f6f5626e51ee78156fd801fb52163af0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 May 2020 16:34:42 -0700 Subject: [PATCH 19/43] pstore: Make sure console capturing will restart The CON_ENABLED flag gets cleared during unregister_console(), so make sure we already reset the console flags before calling register_console(), otherwise unloading and reloading a pstore backend will not restart console logging. Signed-off-by: Kees Cook --- fs/pstore/platform.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 398785ab059f..8beaeff72386 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -516,12 +516,16 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) static struct console pstore_console = { .name = "pstore", .write = pstore_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME, .index = -1, }; static void pstore_register_console(void) { + /* + * Always initialize flags here since prior unregister_console() + * calls may have changed settings (specifically CON_ENABLED). + */ + pstore_console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME; register_console(&pstore_console); } From 563ca40ddf400dbf8c6254077f9b6887101d0f08 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 8 May 2020 09:16:02 -0700 Subject: [PATCH 20/43] pstore/platform: Switch pstore_info::name to const In order to more cleanly pass around backend names, make the "name" member const. This means the module param needs to be dynamic (technically, it was before, so this actually cleans up a minor memory leak if a backend was specified and then gets unloaded.) Link: https://lore.kernel.org/lkml/20200510202436.63222-3-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 3 ++- include/linux/pstore.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 8beaeff72386..715396bef0ea 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -624,7 +624,7 @@ int pstore_register(struct pstore_info *psi) * Update the module parameter backend, so it is visible * through /sys/module/pstore/parameters/backend */ - backend = psi->name; + backend = kstrdup(psi->name, GFP_KERNEL); pr_info("Registered %s as persistent store backend\n", psi->name); @@ -667,6 +667,7 @@ void pstore_unregister(struct pstore_info *psi) free_buf_for_compression(); psinfo = NULL; + kfree(backend); backend = NULL; mutex_unlock(&psinfo_lock); } diff --git a/include/linux/pstore.h b/include/linux/pstore.h index e779441e6d26..f6f22b13e04f 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -170,7 +170,7 @@ struct pstore_record { */ struct pstore_info { struct module *owner; - char *name; + const char *name; struct semaphore buf_lock; char *buf; From d195c39052d1da278a00a6744ce59c383b67b191 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 8 May 2020 09:26:28 -0700 Subject: [PATCH 21/43] pstore/platform: Use backend name for console registration If the pstore backend changes, there's no indication in the logs what the console is (it always says "pstore"). Instead, pass through the active backend's name. (Also adjust the selftest to match.) Link: https://lore.kernel.org/lkml/20200510202436.63222-5-keescook@chromium.org/ Link: https://lore.kernel.org/lkml/20200526135429.GQ12456@shao2-debian Signed-off-by: Kees Cook --- fs/pstore/platform.c | 4 +++- tools/testing/selftests/pstore/pstore_tests | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 715396bef0ea..e8690d8606e0 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -514,13 +514,15 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } static struct console pstore_console = { - .name = "pstore", .write = pstore_console_write, .index = -1, }; static void pstore_register_console(void) { + /* Show which backend is going to get console writes. */ + strscpy(pstore_console.name, psinfo->name, + sizeof(pstore_console.name)); /* * Always initialize flags here since prior unregister_console() * calls may have changed settings (specifically CON_ENABLED). diff --git a/tools/testing/selftests/pstore/pstore_tests b/tools/testing/selftests/pstore/pstore_tests index 1cef54458aff..2aa9a3852a84 100755 --- a/tools/testing/selftests/pstore/pstore_tests +++ b/tools/testing/selftests/pstore/pstore_tests @@ -10,7 +10,7 @@ . ./common_tests prlog -n "Checking pstore console is registered ... " -dmesg | grep -q "console \[pstore" +dmesg | grep -Eq "console \[(pstore|${backend})" show_result $? prlog -n "Checking /dev/pmsg0 exists ... " From d973f7d83dc7360597373536b34e80eea306d15f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 8 May 2020 09:25:19 -0700 Subject: [PATCH 22/43] pstore/platform: Move module params after declarations It is easier to see how module params are used if they're near the variables they use. Link: https://lore.kernel.org/lkml/20200510202436.63222-4-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/platform.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index e8690d8606e0..072440457c08 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -77,12 +77,17 @@ static DEFINE_MUTEX(psinfo_lock); struct pstore_info *psinfo; static char *backend; +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "specific backend to use"); + static char *compress = #ifdef CONFIG_PSTORE_COMPRESS_DEFAULT CONFIG_PSTORE_COMPRESS_DEFAULT; #else NULL; #endif +module_param(compress, charp, 0444); +MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ static struct crypto_comp *tfm; @@ -853,11 +858,5 @@ static void __exit pstore_exit(void) } module_exit(pstore_exit) -module_param(compress, charp, 0444); -MODULE_PARM_DESC(compress, "Pstore compression to use"); - -module_param(backend, charp, 0444); -MODULE_PARM_DESC(backend, "Pstore backend to use"); - MODULE_AUTHOR("Tony Luck "); MODULE_LICENSE("GPL"); From f858b57f7dd28b5bb55e9dc3776fff58f38a7c91 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 May 2020 13:06:18 -0700 Subject: [PATCH 23/43] pstore/ram: Adjust module param permissions to reflect reality A couple module parameters had 0600 permissions, but changing them would have no impact on ramoops, so switch these to 0400 to reflect reality. Link: https://lore.kernel.org/lkml/20200506211523.15077-7-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/ram.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 795622190c01..3b399f72e495 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -53,17 +53,17 @@ MODULE_PARM_DESC(mem_size, "size of reserved RAM used to store oops/panic logs"); static unsigned int mem_type; -module_param(mem_type, uint, 0600); +module_param(mem_type, uint, 0400); MODULE_PARM_DESC(mem_type, "set to 1 to try to use unbuffered memory (default 0)"); static int dump_oops = 1; -module_param(dump_oops, int, 0600); +module_param(dump_oops, int, 0400); MODULE_PARM_DESC(dump_oops, "set to 1 to dump oopses, 0 to only dump panics (default 1)"); static int ramoops_ecc; -module_param_named(ecc, ramoops_ecc, int, 0600); +module_param_named(ecc, ramoops_ecc, int, 0400); MODULE_PARM_DESC(ramoops_ecc, "if non-zero, the option enables ECC support and specifies " "ECC buffer size in bytes (1 is a special value, means 16 " From 26961d76ff35aecb1af0fefd283fb5170786a812 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 May 2020 11:44:14 -0700 Subject: [PATCH 24/43] pstore/ram: Refactor DT size parsing Refactor device tree size parsing routines to be able to pass a non-zero default value for providing a configurable default for the coming "max_reason" field. Also rename the helpers, since we're not always parsing a size -- we're parsing a u32 and making sure it's not greater than INT_MAX. Link: https://lore.kernel.org/lkml/20200506211523.15077-4-keescook@chromium.org/ Link: https://lore.kernel.org/lkml/20200521205223.175957-1-tyhicks@linux.microsoft.com Signed-off-by: Kees Cook --- fs/pstore/ram.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 3b399f72e495..8cdafb686736 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -644,19 +644,25 @@ static int ramoops_init_prz(const char *name, return 0; } -static int ramoops_parse_dt_size(struct platform_device *pdev, - const char *propname, u32 *value) +/* Read a u32 from a dt property and make sure it's safe for an int. */ +static int ramoops_parse_dt_u32(struct platform_device *pdev, + const char *propname, + u32 default_value, u32 *value) { u32 val32 = 0; int ret; ret = of_property_read_u32(pdev->dev.of_node, propname, &val32); - if (ret < 0 && ret != -EINVAL) { + if (ret == -EINVAL) { + /* field is missing, use default value. */ + val32 = default_value; + } else if (ret < 0) { dev_err(&pdev->dev, "failed to parse property %s: %d\n", propname, ret); return ret; } + /* Sanity check our results. */ if (val32 > INT_MAX) { dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32); return -EOVERFLOW; @@ -689,21 +695,22 @@ static int ramoops_parse_dt(struct platform_device *pdev, pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); -#define parse_size(name, field) { \ - ret = ramoops_parse_dt_size(pdev, name, &value); \ +#define parse_u32(name, field, default_value) { \ + ret = ramoops_parse_dt_u32(pdev, name, default_value, \ + &value); \ if (ret < 0) \ return ret; \ field = value; \ } - parse_size("record-size", pdata->record_size); - parse_size("console-size", pdata->console_size); - parse_size("ftrace-size", pdata->ftrace_size); - parse_size("pmsg-size", pdata->pmsg_size); - parse_size("ecc-size", pdata->ecc_info.ecc_size); - parse_size("flags", pdata->flags); + parse_u32("record-size", pdata->record_size, 0); + parse_u32("console-size", pdata->console_size, 0); + parse_u32("ftrace-size", pdata->ftrace_size, 0); + parse_u32("pmsg-size", pdata->pmsg_size, 0); + parse_u32("ecc-size", pdata->ecc_info.ecc_size, 0); + parse_u32("flags", pdata->flags, 0); -#undef parse_size +#undef parse_u32 /* * Some old Chromebooks relied on the kernel setting the From df9bf19d88965758c700f46ef75110336fea8654 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 8 May 2020 08:34:38 -0700 Subject: [PATCH 25/43] pstore/ram: Refactor ftrace buffer merging This changes the ftrace record merging code to be agnostic of pstore/ram, as the first step to making it available as a generic routine for other backends to use, such as pstore/zone. Link: https://lore.kernel.org/lkml/20200510202436.63222-6-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/ram.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 8cdafb686736..dba9b4e3dc3f 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -168,8 +168,9 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } -static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, - struct persistent_ram_zone *src) +static +ssize_t ftrace_log_combine(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) { size_t dest_size, src_size, total, dest_off, src_off; size_t dest_idx = 0, src_idx = 0, merged_idx = 0; @@ -177,19 +178,19 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, struct pstore_ftrace_record *drec, *srec, *mrec; size_t record_size = sizeof(struct pstore_ftrace_record); - dest_off = dest->old_log_size % record_size; - dest_size = dest->old_log_size - dest_off; + dest_off = *dest_log_size % record_size; + dest_size = *dest_log_size - dest_off; - src_off = src->old_log_size % record_size; - src_size = src->old_log_size - src_off; + src_off = src_log_size % record_size; + src_size = src_log_size - src_off; total = dest_size + src_size; merged_buf = kmalloc(total, GFP_KERNEL); if (!merged_buf) return -ENOMEM; - drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off); - srec = (struct pstore_ftrace_record *)(src->old_log + src_off); + drec = (struct pstore_ftrace_record *)(*dest_log + dest_off); + srec = (struct pstore_ftrace_record *)(src_log + src_off); mrec = (struct pstore_ftrace_record *)(merged_buf); while (dest_size > 0 && src_size > 0) { @@ -213,9 +214,9 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, src_size -= record_size; } - kfree(dest->old_log); - dest->old_log = merged_buf; - dest->old_log_size = total; + kfree(*dest_log); + *dest_log = merged_buf; + *dest_log_size = total; return 0; } @@ -291,7 +292,11 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) tmp_prz->corrected_bytes += prz_next->corrected_bytes; tmp_prz->bad_blocks += prz_next->bad_blocks; - size = ftrace_log_combine(tmp_prz, prz_next); + + size = ftrace_log_combine(&tmp_prz->old_log, + &tmp_prz->old_log_size, + prz_next->old_log, + prz_next->old_log_size); if (size) goto out; } From 16a583079e937f6f5e6274ef7fda6dbf7dcb669f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 8 May 2020 08:42:12 -0700 Subject: [PATCH 26/43] pstore/ftrace: Provide ftrace log merging routine Move the ftrace log merging logic out of pstore/ram into pstore/ftrace so other backends can use it, like pstore/zone. Link: https://lore.kernel.org/lkml/20200510202436.63222-7-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/ftrace.c | 54 +++++++++++++++++++++++++++++++++++++++++ fs/pstore/internal.h | 9 +++++++ fs/pstore/ram.c | 57 +++----------------------------------------- 3 files changed, 66 insertions(+), 54 deletions(-) diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index bfbfc2698070..5c0450701293 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "internal.h" @@ -132,3 +133,56 @@ void pstore_unregister_ftrace(void) debugfs_remove_recursive(pstore_ftrace_dir); } + +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + size_t dest_size, src_size, total, dest_off, src_off; + size_t dest_idx = 0, src_idx = 0, merged_idx = 0; + void *merged_buf; + struct pstore_ftrace_record *drec, *srec, *mrec; + size_t record_size = sizeof(struct pstore_ftrace_record); + + dest_off = *dest_log_size % record_size; + dest_size = *dest_log_size - dest_off; + + src_off = src_log_size % record_size; + src_size = src_log_size - src_off; + + total = dest_size + src_size; + merged_buf = kmalloc(total, GFP_KERNEL); + if (!merged_buf) + return -ENOMEM; + + drec = (struct pstore_ftrace_record *)(*dest_log + dest_off); + srec = (struct pstore_ftrace_record *)(src_log + src_off); + mrec = (struct pstore_ftrace_record *)(merged_buf); + + while (dest_size > 0 && src_size > 0) { + if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < + pstore_ftrace_read_timestamp(&srec[src_idx])) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } else { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + } + + while (dest_size > 0) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } + + while (src_size > 0) { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + + kfree(*dest_log); + *dest_log = merged_buf; + *dest_log_size = total; + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_ftrace_combine_log); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 8efd72d93b10..7fb219042f13 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -12,9 +12,18 @@ extern unsigned long kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); extern void pstore_unregister_ftrace(void); +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size); #else static inline void pstore_register_ftrace(void) {} static inline void pstore_unregister_ftrace(void) {} +static inline ssize_t +pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + *dest_log_size = 0; + return 0; +} #endif #ifdef CONFIG_PSTORE_PMSG diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index dba9b4e3dc3f..26a1f502a3ea 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -21,6 +21,7 @@ #include #include #include +#include "internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -168,59 +169,6 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } -static -ssize_t ftrace_log_combine(char **dest_log, size_t *dest_log_size, - const char *src_log, size_t src_log_size) -{ - size_t dest_size, src_size, total, dest_off, src_off; - size_t dest_idx = 0, src_idx = 0, merged_idx = 0; - void *merged_buf; - struct pstore_ftrace_record *drec, *srec, *mrec; - size_t record_size = sizeof(struct pstore_ftrace_record); - - dest_off = *dest_log_size % record_size; - dest_size = *dest_log_size - dest_off; - - src_off = src_log_size % record_size; - src_size = src_log_size - src_off; - - total = dest_size + src_size; - merged_buf = kmalloc(total, GFP_KERNEL); - if (!merged_buf) - return -ENOMEM; - - drec = (struct pstore_ftrace_record *)(*dest_log + dest_off); - srec = (struct pstore_ftrace_record *)(src_log + src_off); - mrec = (struct pstore_ftrace_record *)(merged_buf); - - while (dest_size > 0 && src_size > 0) { - if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < - pstore_ftrace_read_timestamp(&srec[src_idx])) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } else { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - } - - while (dest_size > 0) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } - - while (src_size > 0) { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - - kfree(*dest_log); - *dest_log = merged_buf; - *dest_log_size = total; - - return 0; -} - static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; @@ -293,7 +241,8 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) prz_next->corrected_bytes; tmp_prz->bad_blocks += prz_next->bad_blocks; - size = ftrace_log_combine(&tmp_prz->old_log, + size = pstore_ftrace_combine_log( + &tmp_prz->old_log, &tmp_prz->old_log_size, prz_next->old_log, prz_next->old_log_size); From 6d3cf962dd1a95df868c547b090bfc4c7977f4be Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 May 2020 11:05:43 -0700 Subject: [PATCH 27/43] printk: Collapse shutdown types into a single dump reason To turn the KMSG_DUMP_* reasons into a more ordered list, collapse the redundant KMSG_DUMP_(RESTART|HALT|POWEROFF) reasons into KMSG_DUMP_SHUTDOWN. The current users already don't meaningfully distinguish between them, so there's no need to, as discussed here: https://lore.kernel.org/lkml/CA+CK2bAPv5u1ih5y9t5FUnTyximtFCtDYXJCpuyjOyHNOkRdqw@mail.gmail.com/ Link: https://lore.kernel.org/lkml/20200515184434.8470-2-keescook@chromium.org/ Reviewed-by: Pavel Tatashin Reviewed-by: Petr Mladek Acked-by: Michael Ellerman (powerpc) Signed-off-by: Kees Cook --- arch/powerpc/kernel/nvram_64.c | 4 +--- fs/pstore/platform.c | 8 ++------ include/linux/kmsg_dump.h | 4 +--- kernel/reboot.c | 6 +++--- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index fb4f61096613..0cd1c88bfc8b 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -655,9 +655,7 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, int rc = -1; switch (reason) { - case KMSG_DUMP_RESTART: - case KMSG_DUMP_HALT: - case KMSG_DUMP_POWEROFF: + case KMSG_DUMP_SHUTDOWN: /* These are almost always orderly shutdowns. */ return; case KMSG_DUMP_OOPS: diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 072440457c08..90d74ebaa70a 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -144,12 +144,8 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; - case KMSG_DUMP_RESTART: - return "Restart"; - case KMSG_DUMP_HALT: - return "Halt"; - case KMSG_DUMP_POWEROFF: - return "Poweroff"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; default: return "Unknown"; } diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 2e7a1e032c71..3f82b5cb2d82 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -25,9 +25,7 @@ enum kmsg_dump_reason { KMSG_DUMP_PANIC, KMSG_DUMP_OOPS, KMSG_DUMP_EMERG, - KMSG_DUMP_RESTART, - KMSG_DUMP_HALT, - KMSG_DUMP_POWEROFF, + KMSG_DUMP_SHUTDOWN, }; /** diff --git a/kernel/reboot.c b/kernel/reboot.c index c4d472b7f1b4..491f1347bf43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -250,7 +250,7 @@ void kernel_restart(char *cmd) pr_emerg("Restarting system\n"); else pr_emerg("Restarting system with command '%s'\n", cmd); - kmsg_dump(KMSG_DUMP_RESTART); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -274,7 +274,7 @@ void kernel_halt(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("System halted\n"); - kmsg_dump(KMSG_DUMP_HALT); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_halt(); } EXPORT_SYMBOL_GPL(kernel_halt); @@ -292,7 +292,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); - kmsg_dump(KMSG_DUMP_POWEROFF); + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); From b1f6f161b236d0e5a9222fb8b482e65aaff13689 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 5 May 2020 11:45:06 -0400 Subject: [PATCH 28/43] printk: honor the max_reason field in kmsg_dumper kmsg_dump() allows to dump kmesg buffer for various system events: oops, panic, reboot, etc. It provides an interface to register a callback call for clients, and in that callback interface there is a field "max_reason", but it was getting ignored when set to any "reason" higher than KMSG_DUMP_OOPS unless "always_kmsg_dump" was passed as kernel parameter. Allow clients to actually control their "max_reason", and keep the current behavior when "max_reason" is not set. Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-3-keescook@chromium.org/ Reviewed-by: Petr Mladek Signed-off-by: Kees Cook --- include/linux/kmsg_dump.h | 1 + kernel/printk/printk.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 3f82b5cb2d82..9826014771ab 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -26,6 +26,7 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_EMERG, KMSG_DUMP_SHUTDOWN, + KMSG_DUMP_MAX }; /** diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..a121c2255737 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3157,12 +3157,19 @@ void kmsg_dump(enum kmsg_dump_reason reason) struct kmsg_dumper *dumper; unsigned long flags; - if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) - return; - rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { - if (dumper->max_reason && reason > dumper->max_reason) + enum kmsg_dump_reason max_reason = dumper->max_reason; + + /* + * If client has not provided a specific max_reason, default + * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. + */ + if (max_reason == KMSG_DUMP_UNDEF) { + max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : + KMSG_DUMP_OOPS; + } + if (reason > max_reason) continue; /* initialize iterator with data about the stored records */ From fb13cb8a0482105a415e24042209d02a684255e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 7 May 2020 19:36:22 -0700 Subject: [PATCH 29/43] printk: Introduce kmsg_dump_reason_str() The pstore subsystem already had a private version of this function. With the coming addition of the pstore/zone driver, this needs to be shared. As it really should live with printk, move it there instead. Link: https://lore.kernel.org/lkml/20200515184434.8470-4-keescook@chromium.org/ Acked-by: Petr Mladek Acked-by: Sergey Senozhatsky Reviewed-by: Pavel Tatashin Signed-off-by: Kees Cook --- fs/pstore/platform.c | 18 +----------------- include/linux/kmsg_dump.h | 7 +++++++ kernel/printk/printk.c | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 90d74ebaa70a..5e6c6022deb9 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -135,22 +135,6 @@ enum pstore_type_id pstore_name_to_type(const char *name) } EXPORT_SYMBOL_GPL(pstore_name_to_type); -static const char *get_reason_str(enum kmsg_dump_reason reason) -{ - switch (reason) { - case KMSG_DUMP_PANIC: - return "Panic"; - case KMSG_DUMP_OOPS: - return "Oops"; - case KMSG_DUMP_EMERG: - return "Emergency"; - case KMSG_DUMP_SHUTDOWN: - return "Shutdown"; - default: - return "Unknown"; - } -} - static void pstore_timer_kick(void) { if (pstore_update_ms < 0) @@ -403,7 +387,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned int part = 1; int ret; - why = get_reason_str(reason); + why = kmsg_dump_reason_str(reason); if (down_trylock(&psinfo->buf_lock)) { /* Failed to acquire lock: give up if we cannot wait. */ diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 9826014771ab..3378bcbe585e 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -70,6 +70,8 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper); int kmsg_dump_register(struct kmsg_dumper *dumper); int kmsg_dump_unregister(struct kmsg_dumper *dumper); + +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason); #else static inline void kmsg_dump(enum kmsg_dump_reason reason) { @@ -111,6 +113,11 @@ static inline int kmsg_dump_unregister(struct kmsg_dumper *dumper) { return -EINVAL; } + +static inline const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + return "Disabled"; +} #endif #endif /* _LINUX_KMSG_DUMP_H */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a121c2255737..14ca4d05d902 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3144,6 +3144,23 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); +const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) +{ + switch (reason) { + case KMSG_DUMP_PANIC: + return "Panic"; + case KMSG_DUMP_OOPS: + return "Oops"; + case KMSG_DUMP_EMERG: + return "Emergency"; + case KMSG_DUMP_SHUTDOWN: + return "Shutdown"; + default: + return "Unknown"; + } +} +EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping From 3524e688b8ee50b0edc76f0e020727eb6c684dbc Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 5 May 2020 11:45:07 -0400 Subject: [PATCH 30/43] pstore/platform: Pass max_reason to kmesg dump Add a new member to struct pstore_info for passing information about kmesg dump maximum reason. This allows a finer control of what kmesg dumps are sent to pstore storage backends. Those backends that do not explicitly set this field (keeping it equal to 0), get the default behavior: store only Oopses and Panics, or everything if the printk.always_kmsg_dump boot param is set. Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-5-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/platform.c | 4 +++- include/linux/pstore.h | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 5e6c6022deb9..a9e297eefdff 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -595,8 +595,10 @@ int pstore_register(struct pstore_info *psi) pstore_get_records(0); - if (psi->flags & PSTORE_FLAGS_DMESG) + if (psi->flags & PSTORE_FLAGS_DMESG) { + pstore_dumper.max_reason = psinfo->max_reason; pstore_register_kmsg(); + } if (psi->flags & PSTORE_FLAGS_CONSOLE) pstore_register_console(); if (psi->flags & PSTORE_FLAGS_FTRACE) diff --git a/include/linux/pstore.h b/include/linux/pstore.h index f6f22b13e04f..eb93a54cff31 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -96,6 +96,12 @@ struct pstore_record { * * @read_mutex: serializes @open, @read, @close, and @erase callbacks * @flags: bitfield of frontends the backend can accept writes for + * @max_reason: Used when PSTORE_FLAGS_DMESG is set. Contains the + * kmsg_dump_reason enum value. KMSG_DUMP_UNDEF means + * "use existing kmsg_dump() filtering, based on the + * printk.always_kmsg_dump boot param" (which is either + * KMSG_DUMP_OOPS when false, or KMSG_DUMP_MAX when + * true); see printk.always_kmsg_dump for more details. * @data: backend-private pointer passed back during callbacks * * Callbacks: @@ -179,6 +185,7 @@ struct pstore_info { struct mutex read_mutex; int flags; + int max_reason; void *data; int (*open)(struct pstore_info *psi); From 791205e3ec6081a8da6f00621e3453d622dc41e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 May 2020 14:35:03 -0700 Subject: [PATCH 31/43] pstore/ram: Introduce max_reason and convert dump_oops Now that pstore_register() can correctly pass max_reason to the kmesg dump facility, introduce a new "max_reason" module parameter and "max-reason" Device Tree field. The "dump_oops" module parameter and "dump-oops" Device Tree field are now considered deprecated, but are now automatically converted to their corresponding max_reason values when present, though the new max_reason setting has precedence. For struct ramoops_platform_data, the "dump_oops" member is entirely replaced by a new "max_reason" member, with the only existing user updated in place. Additionally remove the "reason" filter logic from ramoops_pstore_write(), as that is not specifically needed anymore, though technically this is a change in behavior for any ramoops users also setting the printk.always_kmsg_dump boot param, which will cause ramoops to behave as if max_reason was set to KMSG_DUMP_MAX. Co-developed-by: Pavel Tatashin Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-6-keescook@chromium.org/ Signed-off-by: Kees Cook --- Documentation/admin-guide/ramoops.rst | 14 ++++-- drivers/platform/chrome/chromeos_pstore.c | 2 +- fs/pstore/ram.c | 58 +++++++++++++++-------- include/linux/pstore_ram.h | 2 +- 4 files changed, 51 insertions(+), 25 deletions(-) diff --git a/Documentation/admin-guide/ramoops.rst b/Documentation/admin-guide/ramoops.rst index 6dbcc5481000..a60a96218ba9 100644 --- a/Documentation/admin-guide/ramoops.rst +++ b/Documentation/admin-guide/ramoops.rst @@ -32,11 +32,17 @@ memory to be mapped strongly ordered, and atomic operations on strongly ordered memory are implementation defined, and won't work on many ARMs such as omaps. The memory area is divided into ``record_size`` chunks (also rounded down to -power of two) and each oops/panic writes a ``record_size`` chunk of +power of two) and each kmesg dump writes a ``record_size`` chunk of information. -Dumping both oopses and panics can be done by setting 1 in the ``dump_oops`` -variable while setting 0 in that variable dumps only the panics. +Limiting which kinds of kmsg dumps are stored can be controlled via +the ``max_reason`` value, as defined in include/linux/kmsg_dump.h's +``enum kmsg_dump_reason``. For example, to store both Oopses and Panics, +``max_reason`` should be set to 2 (KMSG_DUMP_OOPS), to store only Panics +``max_reason`` should be set to 1 (KMSG_DUMP_PANIC). Setting this to 0 +(KMSG_DUMP_UNDEF), means the reason filtering will be controlled by the +``printk.always_kmsg_dump`` boot param: if unset, it'll be KMSG_DUMP_OOPS, +otherwise KMSG_DUMP_MAX. The module uses a counter to record multiple dumps but the counter gets reset on restart (i.e. new dumps after the restart will overwrite old ones). @@ -90,7 +96,7 @@ Setting the ramoops parameters can be done in several different manners: .mem_address = <...>, .mem_type = <...>, .record_size = <...>, - .dump_oops = <...>, + .max_reason = <...>, .ecc = <...>, }; diff --git a/drivers/platform/chrome/chromeos_pstore.c b/drivers/platform/chrome/chromeos_pstore.c index d13770785fb5..fa51153688b4 100644 --- a/drivers/platform/chrome/chromeos_pstore.c +++ b/drivers/platform/chrome/chromeos_pstore.c @@ -57,7 +57,7 @@ static struct ramoops_platform_data chromeos_ramoops_data = { .record_size = 0x40000, .console_size = 0x20000, .ftrace_size = 0x20000, - .dump_oops = 1, + .max_reason = KMSG_DUMP_OOPS, }; static struct platform_device chromeos_ramoops = { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 26a1f502a3ea..ca6d8a867285 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -58,10 +58,10 @@ module_param(mem_type, uint, 0400); MODULE_PARM_DESC(mem_type, "set to 1 to try to use unbuffered memory (default 0)"); -static int dump_oops = 1; -module_param(dump_oops, int, 0400); -MODULE_PARM_DESC(dump_oops, - "set to 1 to dump oopses, 0 to only dump panics (default 1)"); +static int ramoops_max_reason = -1; +module_param_named(max_reason, ramoops_max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic) "); static int ramoops_ecc; module_param_named(ecc, ramoops_ecc, int, 0400); @@ -70,6 +70,11 @@ MODULE_PARM_DESC(ramoops_ecc, "ECC buffer size in bytes (1 is a special value, means 16 " "bytes ECC)"); +static int ramoops_dump_oops = -1; +module_param_named(dump_oops, ramoops_dump_oops, int, 0400); +MODULE_PARM_DESC(dump_oops, + "(deprecated: use max_reason instead) set to 1 to dump oopses & panics, 0 to only dump panics"); + struct ramoops_context { struct persistent_ram_zone **dprzs; /* Oops dump zones */ struct persistent_ram_zone *cprz; /* Console zone */ @@ -82,7 +87,6 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; size_t pmsg_size; - int dump_oops; u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -336,16 +340,14 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) return -EINVAL; /* - * Out of the various dmesg dump types, ramoops is currently designed - * to only store crash logs, rather than storing general kernel logs. + * We could filter on record->reason here if we wanted to (which + * would duplicate what happened before the "max_reason" setting + * was added), but that would defeat the purpose of a system + * changing printk.always_kmsg_dump, so instead log everything that + * the kmsg dumper sends us, since it should be doing the filtering + * based on the combination of printk.always_kmsg_dump and our + * requested "max_reason". */ - if (record->reason != KMSG_DUMP_OOPS && - record->reason != KMSG_DUMP_PANIC) - return -EINVAL; - - /* Skip Oopes when configured to do so. */ - if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) - return -EINVAL; /* * Explicitly only take the first part of any new crash. @@ -647,7 +649,14 @@ static int ramoops_parse_dt(struct platform_device *pdev, pdata->mem_size = resource_size(res); pdata->mem_address = res->start; pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); - pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); + /* + * Setting "no-dump-oops" is deprecated and will be ignored if + * "max_reason" is also specified. + */ + if (of_property_read_bool(of_node, "no-dump-oops")) + pdata->max_reason = KMSG_DUMP_PANIC; + else + pdata->max_reason = KMSG_DUMP_OOPS; #define parse_u32(name, field, default_value) { \ ret = ramoops_parse_dt_u32(pdev, name, default_value, \ @@ -663,6 +672,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, parse_u32("pmsg-size", pdata->pmsg_size, 0); parse_u32("ecc-size", pdata->ecc_info.ecc_size, 0); parse_u32("flags", pdata->flags, 0); + parse_u32("max-reason", pdata->max_reason, pdata->max_reason); #undef parse_u32 @@ -746,7 +756,6 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; - cxt->dump_oops = pdata->dump_oops; cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; @@ -789,8 +798,10 @@ static int ramoops_probe(struct platform_device *pdev) * the single region size is how to check. */ cxt->pstore.flags = 0; - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + cxt->pstore.max_reason = pdata->max_reason; + } if (cxt->console_size) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; if (cxt->max_ftrace_cnt) @@ -826,7 +837,7 @@ static int ramoops_probe(struct platform_device *pdev) mem_size = pdata->mem_size; mem_address = pdata->mem_address; record_size = pdata->record_size; - dump_oops = pdata->dump_oops; + ramoops_max_reason = pdata->max_reason; ramoops_console_size = pdata->console_size; ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; @@ -909,7 +920,16 @@ static void __init ramoops_register_dummy(void) pdata.console_size = ramoops_console_size; pdata.ftrace_size = ramoops_ftrace_size; pdata.pmsg_size = ramoops_pmsg_size; - pdata.dump_oops = dump_oops; + /* If "max_reason" is set, its value has priority over "dump_oops". */ + if (ramoops_max_reason >= 0) + pdata.max_reason = ramoops_max_reason; + /* Otherwise, if "dump_oops" is set, parse it into "max_reason". */ + else if (ramoops_dump_oops != -1) + pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS + : KMSG_DUMP_PANIC; + /* And if neither are explicitly set, use the default. */ + else + pdata.max_reason = KMSG_DUMP_OOPS; pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU; /* diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 9cb9b9067298..9f16afec7290 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -133,7 +133,7 @@ struct ramoops_platform_data { unsigned long console_size; unsigned long ftrace_size; unsigned long pmsg_size; - int dump_oops; + int max_reason; u32 flags; struct persistent_ram_ecc_info ecc_info; }; From acf12c5e58a45cb978d5de4628f767d326c3d740 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 5 May 2020 11:45:10 -0400 Subject: [PATCH 32/43] ramoops: Add "max-reason" optional field to ramoops DT node Currently, it is only possible to get kmsg dumps for panic and oops, or just panic, via "no-dump-oops". With "max-reason" it is possible to dump messages for other kmsg_dump events, for example emerg and shutdown. Signed-off-by: Pavel Tatashin Link: https://lore.kernel.org/lkml/20200515184434.8470-7-keescook@chromium.org/ Signed-off-by: Kees Cook --- .../devicetree/bindings/reserved-memory/ramoops.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/reserved-memory/ramoops.txt b/Documentation/devicetree/bindings/reserved-memory/ramoops.txt index 0eba562fe5c6..b7886fea368c 100644 --- a/Documentation/devicetree/bindings/reserved-memory/ramoops.txt +++ b/Documentation/devicetree/bindings/reserved-memory/ramoops.txt @@ -30,7 +30,7 @@ Optional properties: - ecc-size: enables ECC support and specifies ECC buffer size in bytes (defaults to 0: no ECC) -- record-size: maximum size in bytes of each dump done on oops/panic +- record-size: maximum size in bytes of each kmsg dump. (defaults to 0: disabled) - console-size: size in bytes of log buffer reserved for kernel messages @@ -45,7 +45,16 @@ Optional properties: - unbuffered: if present, use unbuffered mappings to map the reserved region (defaults to buffered mappings) -- no-dump-oops: if present, only dump panics (defaults to panics and oops) +- max-reason: if present, sets maximum type of kmsg dump reasons to store + (defaults to 2: log Oopses and Panics). This can be set to INT_MAX to + store all kmsg dumps. See include/linux/kmsg_dump.h KMSG_DUMP_* for other + kmsg dump reason values. Setting this to 0 (KMSG_DUMP_UNDEF), means the + reason filtering will be controlled by the printk.always_kmsg_dump boot + param: if unset, it will be KMSG_DUMP_OOPS, otherwise KMSG_DUMP_MAX. + +- no-dump-oops: deprecated, use max_reason instead. If present, and + max_reason is not specified, it is equivalent to max_reason = 1 + (KMSG_DUMP_PANIC). - flags: if present, pass ramoops behavioral flags (defaults to 0, see include/linux/pstore_ram.h RAMOOPS_FLAG_* for flag values). From d26c3321fe18dc74517dc1f518d584aa33b0a851 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:54:56 +0800 Subject: [PATCH 33/43] pstore/zone: Introduce common layer to manage storage zones Implement a common set of APIs needed to support pstore storage zones, based on how ramoops is designed. This will be used by pstore/blk with the intention of migrating pstore/ram in the future. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-2-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 7 + fs/pstore/Makefile | 3 + fs/pstore/zone.c | 985 ++++++++++++++++++++++++++++++++++++ include/linux/pstore_zone.h | 44 ++ 4 files changed, 1039 insertions(+) create mode 100644 fs/pstore/zone.c create mode 100644 include/linux/pstore_zone.h diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8f0369aad22a..98d2457bdd9f 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -153,3 +153,10 @@ config PSTORE_RAM "ramoops.ko". For more information, see Documentation/admin-guide/ramoops.rst. + +config PSTORE_ZONE + tristate + depends on PSTORE + help + The common layer for pstore/blk (and pstore/ram in the future) + to manage storage in zones. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 967b5891f325..58a967cbe4af 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -12,3 +12,6 @@ pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o + +pstore_zone-objs += zone.o +obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c new file mode 100644 index 000000000000..362d72a24012 --- /dev/null +++ b/fs/pstore/zone.c @@ -0,0 +1,985 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide a pstore intermediate backend, organized into kernel memory + * allocated zones that are then mapped and flushed into a single + * contiguous region on a storage backend of some kind (block, mtd, etc). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/** + * struct psz_head - header of zone to flush to storage + * + * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value) + * @datalen: length of data in @data + * @data: zone data. + */ +struct psz_buffer { +#define PSZ_SIG (0x43474244) /* DBGC */ + uint32_t sig; + atomic_t datalen; + uint8_t data[]; +}; + +/** + * struct psz_kmsg_header - kmsg dump-specific header to flush to storage + * + * @magic: magic num for kmsg dump header + * @time: kmsg dump trigger time + * @compressed: whether conpressed + * @counter: kmsg dump counter + * @reason: the kmsg dump reason (e.g. oops, panic, etc) + * @data: pointer to log data + * + * This is a sub-header for a kmsg dump, trailing after &psz_buffer. + */ +struct psz_kmsg_header { +#define PSTORE_KMSG_HEADER_MAGIC 0x4dfc3ae5 /* Just a random number */ + uint32_t magic; + struct timespec64 time; + bool compressed; + uint32_t counter; + enum kmsg_dump_reason reason; + uint8_t data[]; +}; + +/** + * struct pstore_zone - single stored buffer + * + * @off: zone offset of storage + * @type: front-end type for this zone + * @name: front-end name for this zone + * @buffer: pointer to data buffer managed by this zone + * @oldbuf: pointer to old data buffer + * @buffer_size: bytes in @buffer->data + * @should_recover: whether this zone should recover from storage + * @dirty: whether the data in @buffer dirty + * + * zone structure in memory. + */ +struct pstore_zone { + loff_t off; + const char *name; + enum pstore_type_id type; + + struct psz_buffer *buffer; + struct psz_buffer *oldbuf; + size_t buffer_size; + bool should_recover; + atomic_t dirty; +}; + +/** + * struct psz_context - all about running state of pstore/zone + * + * @kpszs: kmsg dump storage zones + * @kmsg_max_cnt: max count of @kpszs + * @kmsg_read_cnt: counter of total read kmsg dumps + * @kmsg_write_cnt: counter of total kmsg dump writes + * @oops_counter: counter of oops dumps + * @panic_counter: counter of panic dumps + * @recovered: whether finished recovering data from storage + * @on_panic: whether panic is happening + * @pstore_zone_info_lock: lock to @pstore_zone_info + * @pstore_zone_info: information from backend + * @pstore: structure for pstore + */ +struct psz_context { + struct pstore_zone **kpszs; + unsigned int kmsg_max_cnt; + unsigned int kmsg_read_cnt; + unsigned int kmsg_write_cnt; + /* + * These counters should be calculated during recovery. + * It records the oops/panic times after crashes rather than boots. + */ + unsigned int oops_counter; + unsigned int panic_counter; + atomic_t recovered; + atomic_t on_panic; + + /* + * pstore_zone_info_lock protects this entire structure during calls + * to register_pstore_zone()/unregister_pstore_zone(). + */ + struct mutex pstore_zone_info_lock; + struct pstore_zone_info *pstore_zone_info; + struct pstore_info pstore; +}; +static struct psz_context pstore_zone_cxt; + +/** + * enum psz_flush_mode - flush mode for psz_zone_write() + * + * @FLUSH_NONE: do not flush to storage but update data on memory + * @FLUSH_PART: just flush part of data including meta data to storage + * @FLUSH_META: just flush meta data of zone to storage + * @FLUSH_ALL: flush all of zone + */ +enum psz_flush_mode { + FLUSH_NONE = 0, + FLUSH_PART, + FLUSH_META, + FLUSH_ALL, +}; + +static inline int buffer_datalen(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->datalen); +} + +static inline bool is_on_panic(void) +{ + return atomic_read(&pstore_zone_cxt.on_panic); +} + +static ssize_t psz_zone_read(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone->buffer) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->buffer->data + off, len); + return len; +} + +static int psz_zone_write(struct pstore_zone *zone, + enum psz_flush_mode flush_mode, const char *buf, + size_t len, unsigned long off) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + ssize_t wcnt = 0; + ssize_t (*writeop)(const char *buf, size_t bytes, loff_t pos); + size_t wlen; + + if (off > zone->buffer_size) + return -EINVAL; + + wlen = min_t(size_t, len, zone->buffer_size - off); + if (buf && wlen) { + memcpy(zone->buffer->data + off, buf, wlen); + atomic_set(&zone->buffer->datalen, wlen + off); + } + + /* avoid to damage old records */ + if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered)) + goto dirty; + + writeop = is_on_panic() ? info->panic_write : info->write; + if (!writeop) + goto dirty; + + switch (flush_mode) { + case FLUSH_NONE: + if (unlikely(buf && wlen)) + goto dirty; + return 0; + case FLUSH_PART: + wcnt = writeop((const char *)zone->buffer->data + off, wlen, + zone->off + sizeof(*zone->buffer) + off); + if (wcnt != wlen) + goto dirty; + fallthrough; + case FLUSH_META: + wlen = sizeof(struct psz_buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + case FLUSH_ALL: + wlen = zone->buffer_size + sizeof(*zone->buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + } + + return 0; +dirty: + atomic_set(&zone->dirty, true); + return -EBUSY; +} + +static int psz_flush_dirty_zone(struct pstore_zone *zone) +{ + int ret; + + if (unlikely(!zone)) + return -EINVAL; + + if (unlikely(!atomic_read(&pstore_zone_cxt.recovered))) + return -EBUSY; + + if (!atomic_xchg(&zone->dirty, false)) + return 0; + + ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0); + if (ret) + atomic_set(&zone->dirty, true); + return ret; +} + +static int psz_flush_dirty_zones(struct pstore_zone **zones, unsigned int cnt) +{ + int i, ret; + struct pstore_zone *zone; + + if (!zones) + return -EINVAL; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (!zone) + return -EINVAL; + ret = psz_flush_dirty_zone(zone); + if (ret) + return ret; + } + return 0; +} + +static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new) +{ + const char *data = (const char *)old->buffer->data; + int ret; + + ret = psz_zone_write(new, FLUSH_ALL, data, buffer_datalen(old), 0); + if (ret) { + atomic_set(&new->buffer->datalen, 0); + atomic_set(&new->dirty, false); + return ret; + } + atomic_set(&old->buffer->datalen, 0); + return 0; +} + +static int psz_kmsg_recover_data(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone = NULL; + struct psz_buffer *buf; + unsigned long i; + ssize_t rcnt; + + if (!info->read) + return -EINVAL; + + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + if (atomic_read(&zone->dirty)) { + unsigned int wcnt = cxt->kmsg_write_cnt; + struct pstore_zone *new = cxt->kpszs[wcnt]; + int ret; + + ret = psz_move_zone(zone, new); + if (ret) { + pr_err("move zone from %lu to %d failed\n", + i, wcnt); + return ret; + } + cxt->kmsg_write_cnt = (wcnt + 1) % cxt->kmsg_max_cnt; + } + if (!zone->should_recover) + continue; + buf = zone->buffer; + rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf), + zone->off); + if (rcnt != zone->buffer_size + sizeof(*buf)) + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + return 0; +} + +static int psz_kmsg_recover_meta(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone; + size_t rcnt, len; + struct psz_buffer *buf; + struct psz_kmsg_header *hdr; + struct timespec64 time = { }; + unsigned long i; + /* + * Recover may on panic, we can't allocate any memory by kmalloc. + * So, we use local array instead. + */ + char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0}; + + if (!info->read) + return -EINVAL; + + len = sizeof(*buf) + sizeof(*hdr); + buf = (struct psz_buffer *)buffer_header; + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + + rcnt = info->read((char *)buf, len, zone->off); + if (rcnt != len) { + pr_err("read %s with id %lu failed\n", zone->name, i); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (buf->sig != zone->buffer->sig) { + pr_debug("no valid data in kmsg dump zone %lu\n", i); + continue; + } + + if (zone->buffer_size < atomic_read(&buf->datalen)) { + pr_info("found overtop zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + hdr = (struct psz_kmsg_header *)buf->data; + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) { + pr_info("found invalid zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + /* + * we get the newest zone, and the next one must be the oldest + * or unused zone, because we do write one by one like a circle. + */ + if (hdr->time.tv_sec >= time.tv_sec) { + time.tv_sec = hdr->time.tv_sec; + cxt->kmsg_write_cnt = (i + 1) % cxt->kmsg_max_cnt; + } + + if (hdr->reason == KMSG_DUMP_OOPS) + cxt->oops_counter = + max(cxt->oops_counter, hdr->counter); + else if (hdr->reason == KMSG_DUMP_PANIC) + cxt->panic_counter = + max(cxt->panic_counter, hdr->counter); + + if (!atomic_read(&buf->datalen)) { + pr_debug("found erased zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, + atomic_read(&buf->datalen)); + continue; + } + + if (!is_on_panic()) + zone->should_recover = true; + pr_debug("found nice zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, atomic_read(&buf->datalen)); + } + + return 0; +} + +static int psz_kmsg_recover(struct psz_context *cxt) +{ + int ret; + + if (!cxt->kpszs) + return 0; + + ret = psz_kmsg_recover_meta(cxt); + if (ret) + goto recover_fail; + + ret = psz_kmsg_recover_data(cxt); + if (ret) + goto recover_fail; + + return 0; +recover_fail: + pr_debug("psz_recover_kmsg failed\n"); + return ret; +} + +/** + * psz_recovery() - recover data from storage + * @cxt: the context of pstore/zone + * + * recovery means reading data back from storage after rebooting + * + * Return: 0 on success, others on failure. + */ +static inline int psz_recovery(struct psz_context *cxt) +{ + int ret; + + if (atomic_read(&cxt->recovered)) + return 0; + + ret = psz_kmsg_recover(cxt); + + if (unlikely(ret)) + pr_err("recover failed\n"); + else { + pr_debug("recover end!\n"); + atomic_set(&cxt->recovered, 1); + } + return ret; +} + +static int psz_pstore_open(struct pstore_info *psi) +{ + struct psz_context *cxt = psi->data; + + cxt->kmsg_read_cnt = 0; + return 0; +} + +static inline bool psz_ok(struct pstore_zone *zone) +{ + if (zone && zone->buffer && buffer_datalen(zone)) + return true; + return false; +} + +static inline int psz_kmsg_erase(struct psz_context *cxt, + struct pstore_zone *zone, struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + if (unlikely(!psz_ok(zone))) + return 0; + /* this zone is already updated, no need to erase */ + if (record->count != hdr->counter) + return 0; + + atomic_set(&zone->buffer->datalen, 0); + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); +} + +static int psz_pstore_erase(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + switch (record->type) { + case PSTORE_TYPE_DMESG: + if (record->id >= cxt->kmsg_max_cnt) + return -EINVAL; + return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); + default: + return -EINVAL; + } +} + +static void psz_write_kmsg_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + hdr->magic = PSTORE_KMSG_HEADER_MAGIC; + hdr->compressed = record->compressed; + hdr->time.tv_sec = record->time.tv_sec; + hdr->time.tv_nsec = record->time.tv_nsec; + hdr->reason = record->reason; + if (hdr->reason == KMSG_DUMP_OOPS) + hdr->counter = ++cxt->oops_counter; + else if (hdr->reason == KMSG_DUMP_PANIC) + hdr->counter = ++cxt->panic_counter; + else + hdr->counter = 0; +} + +static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, + struct pstore_record *record) +{ + size_t size, hlen; + struct pstore_zone *zone; + unsigned int zonenum; + + zonenum = cxt->kmsg_write_cnt; + zone = cxt->kpszs[zonenum]; + if (unlikely(!zone)) + return -ENOSPC; + cxt->kmsg_write_cnt = (zonenum + 1) % cxt->kmsg_max_cnt; + + pr_debug("write %s to zone id %d\n", zone->name, zonenum); + psz_write_kmsg_hdr(zone, record); + hlen = sizeof(struct psz_kmsg_header); + size = min_t(size_t, record->size, zone->buffer_size - hlen); + return psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); +} + +static int notrace psz_kmsg_write(struct psz_context *cxt, + struct pstore_record *record) +{ + int ret; + + /* + * Explicitly only take the first part of any new crash. + * If our buffer is larger than kmsg_bytes, this can never happen, + * and if our buffer is smaller than kmsg_bytes, we don't want the + * report split across multiple records. + */ + if (record->part != 1) + return -ENOSPC; + + if (!cxt->kpszs) + return -ENOSPC; + + ret = psz_kmsg_write_record(cxt, record); + if (!ret) { + pr_debug("try to flush other dirty zones\n"); + psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + } + + /* always return 0 as we had handled it on buffer */ + return 0; +} + +static int notrace psz_pstore_write(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + if (record->type == PSTORE_TYPE_DMESG && + record->reason == KMSG_DUMP_PANIC) + atomic_set(&cxt->on_panic, 1); + + switch (record->type) { + case PSTORE_TYPE_DMESG: + return psz_kmsg_write(cxt, record); + default: + return -EINVAL; + } +} + +static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) +{ + struct pstore_zone *zone = NULL; + + while (cxt->kmsg_read_cnt < cxt->kmsg_max_cnt) { + zone = cxt->kpszs[cxt->kmsg_read_cnt++]; + if (psz_ok(zone)) + return zone; + } + + return NULL; +} + +static int psz_kmsg_read_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) + return -EINVAL; + record->compressed = hdr->compressed; + record->time.tv_sec = hdr->time.tv_sec; + record->time.tv_nsec = hdr->time.tv_nsec; + record->reason = hdr->reason; + record->count = hdr->counter; + return 0; +} + +static ssize_t psz_kmsg_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + ssize_t size, hlen = 0; + + size = buffer_datalen(zone); + /* Clear and skip this kmsg dump record if it has no valid header */ + if (psz_kmsg_read_hdr(zone, record)) { + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->dirty, 0); + return -ENOMSG; + } + size -= sizeof(struct psz_kmsg_header); + + if (!record->compressed) { + char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n", + kmsg_dump_reason_str(record->reason), + record->count); + hlen = strlen(buf); + record->buf = krealloc(buf, hlen + size, GFP_KERNEL); + if (!record->buf) { + kfree(buf); + return -ENOMEM; + } + } else { + record->buf = kmalloc(size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + } + + size = psz_zone_read(zone, record->buf + hlen, size, + sizeof(struct psz_kmsg_header)); + if (unlikely(size < 0)) { + kfree(record->buf); + return -ENOMSG; + } + + return size + hlen; +} + +static ssize_t psz_pstore_read(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + ssize_t (*readop)(struct pstore_zone *zone, + struct pstore_record *record); + struct pstore_zone *zone; + ssize_t ret; + + /* before read, we must recover from storage */ + ret = psz_recovery(cxt); + if (ret) + return ret; + +next_zone: + zone = psz_read_next_zone(cxt); + if (!zone) + return 0; + + record->type = zone->type; + switch (record->type) { + case PSTORE_TYPE_DMESG: + readop = psz_kmsg_read; + record->id = cxt->kmsg_read_cnt - 1; + break; + default: + goto next_zone; + } + + ret = readop(zone, record); + if (ret == -ENOMSG) + goto next_zone; + return ret; +} + +static struct psz_context pstore_zone_cxt = { + .pstore_zone_info_lock = + __MUTEX_INITIALIZER(pstore_zone_cxt.pstore_zone_info_lock), + .recovered = ATOMIC_INIT(0), + .on_panic = ATOMIC_INIT(0), + .pstore = { + .owner = THIS_MODULE, + .open = psz_pstore_open, + .read = psz_pstore_read, + .write = psz_pstore_write, + .erase = psz_pstore_erase, + }, +}; + +static void psz_free_zone(struct pstore_zone **pszone) +{ + struct pstore_zone *zone = *pszone; + + if (!zone) + return; + + kfree(zone->buffer); + kfree(zone); + *pszone = NULL; +} + +static void psz_free_zones(struct pstore_zone ***pszones, unsigned int *cnt) +{ + struct pstore_zone **zones = *pszones; + + if (!zones) + return; + + while (*cnt > 0) { + (*cnt)--; + psz_free_zone(&(zones[*cnt])); + } + kfree(zones); + *pszones = NULL; +} + +static void psz_free_all_zones(struct psz_context *cxt) +{ + if (cxt->kpszs) + psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); +} + +static struct pstore_zone *psz_init_zone(enum pstore_type_id type, + loff_t *off, size_t size) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone *zone; + const char *name = pstore_type_to_name(type); + + if (!size) + return NULL; + + if (*off + size > info->total_size) { + pr_err("no room for %s (0x%zx@0x%llx over 0x%lx)\n", + name, size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + zone = kzalloc(sizeof(struct pstore_zone), GFP_KERNEL); + if (!zone) + return ERR_PTR(-ENOMEM); + + zone->buffer = kmalloc(size, GFP_KERNEL); + if (!zone->buffer) { + kfree(zone); + return ERR_PTR(-ENOMEM); + } + memset(zone->buffer, 0xFF, size); + zone->off = *off; + zone->name = name; + zone->type = type; + zone->buffer_size = size - sizeof(struct psz_buffer); + zone->buffer->sig = type ^ PSZ_SIG; + atomic_set(&zone->dirty, 0); + atomic_set(&zone->buffer->datalen, 0); + + *off += size; + + pr_debug("pszone %s: off 0x%llx, %zu header, %zu data\n", zone->name, + zone->off, sizeof(*zone->buffer), zone->buffer_size); + return zone; +} + +static struct pstore_zone **psz_init_zones(enum pstore_type_id type, + loff_t *off, size_t total_size, ssize_t record_size, + unsigned int *cnt) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone **zones, *zone; + const char *name = pstore_type_to_name(type); + int c, i; + + *cnt = 0; + if (!total_size || !record_size) + return NULL; + + if (*off + total_size > info->total_size) { + pr_err("no room for zones %s (0x%zx@0x%llx over 0x%lx)\n", + name, total_size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + c = total_size / record_size; + zones = kcalloc(c, sizeof(*zones), GFP_KERNEL); + if (!zones) { + pr_err("allocate for zones %s failed\n", name); + return ERR_PTR(-ENOMEM); + } + memset(zones, 0, c * sizeof(*zones)); + + for (i = 0; i < c; i++) { + zone = psz_init_zone(type, off, record_size); + if (!zone || IS_ERR(zone)) { + pr_err("initialize zones %s failed\n", name); + psz_free_zones(&zones, &i); + return (void *)zone; + } + zones[i] = zone; + } + + *cnt = c; + return zones; +} + +static int psz_alloc_zones(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + loff_t off = 0; + int err; + size_t size; + + size = info->total_size; + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, size, + info->kmsg_size, &cxt->kmsg_max_cnt); + if (IS_ERR(cxt->kpszs)) { + err = PTR_ERR(cxt->kpszs); + cxt->kpszs = NULL; + goto fail_out; + } + + return 0; +fail_out: + return err; +} + +/** + * register_pstore_zone() - register to pstore/zone + * + * @info: back-end driver information. See &struct pstore_zone_info. + * + * Only one back-end at one time. + * + * Return: 0 on success, others on failure. + */ +int register_pstore_zone(struct pstore_zone_info *info) +{ + int err = -EINVAL; + struct psz_context *cxt = &pstore_zone_cxt; + + if (info->total_size < 4096) { + pr_warn("total_size must be >= 4096\n"); + return -EINVAL; + } + + if (!info->kmsg_size) { + pr_warn("at least one record size must be non-zero\n"); + return -EINVAL; + } + + if (!info->name || !info->name[0]) + return -EINVAL; + +#define check_size(name, size) { \ + if (info->name > 0 && info->name < (size)) { \ + pr_err(#name " must be over %d\n", (size)); \ + return -EINVAL; \ + } \ + if (info->name & (size - 1)) { \ + pr_err(#name " must be a multiple of %d\n", \ + (size)); \ + return -EINVAL; \ + } \ + } + + check_size(total_size, 4096); + check_size(kmsg_size, SECTOR_SIZE); + +#undef check_size + + /* + * the @read and @write must be applied. + * if no @read, pstore may mount failed. + * if no @write, pstore do not support to remove record file. + */ + if (!info->read || !info->write) { + pr_err("no valid general read/write interface\n"); + return -EINVAL; + } + + mutex_lock(&cxt->pstore_zone_info_lock); + if (cxt->pstore_zone_info) { + pr_warn("'%s' already loaded: ignoring '%s'\n", + cxt->pstore_zone_info->name, info->name); + mutex_unlock(&cxt->pstore_zone_info_lock); + return -EBUSY; + } + cxt->pstore_zone_info = info; + + pr_debug("register %s with properties:\n", info->name); + pr_debug("\ttotal size : %ld Bytes\n", info->total_size); + pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); + + err = psz_alloc_zones(cxt); + if (err) { + pr_err("alloc zones failed\n"); + goto fail_out; + } + + if (info->kmsg_size) { + cxt->pstore.bufsize = cxt->kpszs[0]->buffer_size - + sizeof(struct psz_kmsg_header); + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + if (!cxt->pstore.buf) { + err = -ENOMEM; + goto fail_free; + } + } + cxt->pstore.data = cxt; + + pr_info("registered %s as backend for", info->name); + cxt->pstore.max_reason = info->max_reason; + cxt->pstore.name = info->name; + if (info->kmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + pr_cont(" kmsg(%s", + kmsg_dump_reason_str(cxt->pstore.max_reason)); + if (cxt->pstore_zone_info->panic_write) + pr_cont(",panic_write"); + pr_cont(")"); + } + pr_cont("\n"); + + err = pstore_register(&cxt->pstore); + if (err) { + pr_err("registering with pstore failed\n"); + goto fail_free; + } + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + + return 0; + +fail_free: + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + psz_free_all_zones(cxt); +fail_out: + pstore_zone_cxt.pstore_zone_info = NULL; + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + return err; +} +EXPORT_SYMBOL_GPL(register_pstore_zone); + +/** + * unregister_pstore_zone() - unregister to pstore/zone + * + * @info: back-end driver information. See struct pstore_zone_info. + */ +void unregister_pstore_zone(struct pstore_zone_info *info) +{ + struct psz_context *cxt = &pstore_zone_cxt; + + mutex_lock(&cxt->pstore_zone_info_lock); + if (!cxt->pstore_zone_info) { + mutex_unlock(&cxt->pstore_zone_info_lock); + return; + } + + /* Stop incoming writes from pstore. */ + pstore_unregister(&cxt->pstore); + + /* Clean up allocations. */ + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + cxt->pstore_zone_info = NULL; + + psz_free_all_zones(cxt); + + /* Clear counters and zone state. */ + cxt->oops_counter = 0; + cxt->panic_counter = 0; + atomic_set(&cxt->recovered, 0); + atomic_set(&cxt->on_panic, 0); + + mutex_unlock(&cxt->pstore_zone_info_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_zone); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao "); +MODULE_AUTHOR("Kees Cook "); +MODULE_DESCRIPTION("Storage Manager for pstore/blk"); diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h new file mode 100644 index 000000000000..eb005d9ae40c --- /dev/null +++ b/include/linux/pstore_zone.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PSTORE_ZONE_H_ +#define __PSTORE_ZONE_H_ + +#include + +typedef ssize_t (*pstore_zone_read_op)(char *, size_t, loff_t); +typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); +/** + * struct pstore_zone_info - pstore/zone back-end driver structure + * + * @owner: Module which is responsible for this back-end driver. + * @name: Name of the back-end driver. + * @total_size: The total size in bytes pstore/zone can use. It must be greater + * than 4096 and be multiple of 4096. + * @kmsg_size: The size of oops/panic zone. Zero means disabled, otherwise, + * it must be multiple of SECTOR_SIZE(512 Bytes). + * @max_reason: Maximum kmsg dump reason to store. + * @read: The general read operation. Both of the function parameters + * @size and @offset are relative value to storage. + * On success, the number of bytes should be returned, others + * means error. + * @write: The same as @read. + * @panic_write:The write operation only used for panic case. It's optional + * if you do not care panic log. The parameters and return value + * are the same as @read. + */ +struct pstore_zone_info { + struct module *owner; + const char *name; + + unsigned long total_size; + unsigned long kmsg_size; + int max_reason; + pstore_zone_read_op read; + pstore_zone_write_op write; + pstore_zone_write_op panic_write; +}; + +extern int register_pstore_zone(struct pstore_zone_info *info); +extern void unregister_pstore_zone(struct pstore_zone_info *info); + +#endif From 17639f67c1d61aba3c05e7703f75cd468f9d484f Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:54:57 +0800 Subject: [PATCH 34/43] pstore/blk: Introduce backend for block devices pstore/blk is similar to pstore/ram, but uses a block device as the storage rather than persistent ram. The pstore/blk backend solves two common use-cases that used to preclude using pstore/ram: - not all devices have a battery that could be used to persist regular RAM across power failures. - most embedded intelligent equipment have no persistent ram, which increases costs, instead preferring cheaper solutions, like block devices. pstore/blk provides separate configurations for the end user and for the block drivers. User configuration determines how pstore/blk operates, such as record sizes, max kmsg dump reasons, etc. These can be set by Kconfig and/or module parameters, but module parameter have priority over Kconfig. Driver configuration covers all the details about the target block device, such as total size of the device and how to perform read/write operations. These are provided by block drivers, calling pstore_register_blkdev(), including an optional panic_write callback used to bypass regular IO APIs in an effort to avoid potentially destabilized kernel code during a panic. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-3-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 64 ++++++ fs/pstore/Makefile | 3 + fs/pstore/blk.c | 434 +++++++++++++++++++++++++++++++++++++ include/linux/pstore_blk.h | 51 +++++ 4 files changed, 552 insertions(+) create mode 100644 fs/pstore/blk.c create mode 100644 include/linux/pstore_blk.h diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 98d2457bdd9f..958bec75f907 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -160,3 +160,67 @@ config PSTORE_ZONE help The common layer for pstore/blk (and pstore/ram in the future) to manage storage in zones. + +config PSTORE_BLK + tristate "Log panic/oops to a block device" + depends on PSTORE + depends on BLOCK + select PSTORE_ZONE + default n + help + This enables panic and oops message to be logged to a block dev + where it can be read back at some later point. + + If unsure, say N. + +config PSTORE_BLK_BLKDEV + string "block device identifier" + depends on PSTORE_BLK + default "" + help + Which block device should be used for pstore/blk. + + It accept the following variants: + 1) device number in hexadecimal representation, + with no leading 0x, for example b302. + 2) /dev/ represents the device number of disk + 3) /dev/ represents the device number + of partition - device number of disk plus the partition number + 4) /dev/p - same as the above, this form is + used when disk name of partitioned disk ends with a digit. + 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + unique id of a partition if the partition table provides it. + The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + filled hex representation of the 32-bit "NT disk signature", and PP + is a zero-filled hex representation of the 1-based partition number. + 6) PARTUUID=/PARTNROFF= to select a partition in relation + to a partition with a known unique id. + 7) : major and minor number of the device separated by + a colon. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_KMSG_SIZE + int "Size in Kbytes of kmsg dump log to store" + depends on PSTORE_BLK + default 64 + help + This just sets size of kmsg dump (oops, panic, etc) log for + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_MAX_REASON + int "Maximum kmsg dump reason to store" + depends on PSTORE_BLK + default 2 + help + The maximum reason for kmsg dumps to store. The default is + 2 (KMSG_DUMP_OOPS), see include/linux/kmsg_dump.h's + enum kmsg_dump_reason for more details. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 58a967cbe4af..c270467aeece 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -15,3 +15,6 @@ obj-$(CONFIG_PSTORE_RAM) += ramoops.o pstore_zone-objs += zone.o obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o + +pstore_blk-objs += blk.o +obj-$(CONFIG_PSTORE_BLK) += pstore_blk.o diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c new file mode 100644 index 000000000000..8f131c6e412e --- /dev/null +++ b/fs/pstore/blk.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implements pstore backend driver that write to block (or non-block) storage + * devices, using the pstore/zone API. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include "../../block/blk.h" +#include +#include +#include +#include +#include +#include +#include +#include + +static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE; +module_param(kmsg_size, long, 0400); +MODULE_PARM_DESC(kmsg_size, "kmsg dump record size in kbytes"); + +static int max_reason = CONFIG_PSTORE_BLK_MAX_REASON; +module_param(max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic)"); + +/* + * blkdev - the block device to use for pstore storage + * + * Usually, this will be a partition of a block device. + * + * blkdev accepts the following variants: + * 1) device number in hexadecimal representation, + * with no leading 0x, for example b302. + * 2) /dev/ represents the device number of disk + * 3) /dev/ represents the device number + * of partition - device number of disk plus the partition number + * 4) /dev/p - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 6) PARTUUID=/PARTNROFF= to select a partition in relation to + * a partition with a known unique id. + * 7) : major and minor number of the device separated by + * a colon. + */ +static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV; +module_param_string(blkdev, blkdev, 80, 0400); +MODULE_PARM_DESC(blkdev, "block device for pstore storage"); + +/* + * All globals must only be accessed under the pstore_blk_lock + * during the register/unregister functions. + */ +static DEFINE_MUTEX(pstore_blk_lock); +static struct block_device *psblk_bdev; +static struct pstore_zone_info *pstore_zone_info; +static pstore_blk_panic_write_op blkdev_panic_write; + +struct bdev_info { + dev_t devt; + sector_t nr_sects; + sector_t start_sect; +}; + +/** + * struct pstore_device_info - back-end pstore/blk driver structure. + * + * @total_size: The total size in bytes pstore/blk can use. It must be greater + * than 4096 and be multiple of 4096. + * @flags: Refer to macro starting with PSTORE_FLAGS defined in + * linux/pstore.h. It means what front-ends this device support. + * Zero means all backends for compatible. + * @read: The general read operation. Both of the function parameters + * @size and @offset are relative value to bock device (not the + * whole disk). + * On success, the number of bytes should be returned, others + * means error. + * @write: The same as @read. + * @panic_write:The write operation only used for panic case. It's optional + * if you do not care panic log. The parameters and return value + * are the same as @read. + */ +struct pstore_device_info { + unsigned long total_size; + unsigned int flags; + pstore_zone_read_op read; + pstore_zone_write_op write; + pstore_zone_write_op panic_write; +}; + +static int psblk_register_do(struct pstore_device_info *dev) +{ + int ret; + + if (!dev || !dev->total_size || !dev->read || !dev->write) + return -EINVAL; + + mutex_lock(&pstore_blk_lock); + + /* someone already registered before */ + if (pstore_zone_info) { + mutex_unlock(&pstore_blk_lock); + return -EBUSY; + } + pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); + if (!pstore_zone_info) { + mutex_unlock(&pstore_blk_lock); + return -ENOMEM; + } + + /* zero means not limit on which backends to attempt to store. */ + if (!dev->flags) + dev->flags = UINT_MAX; + +#define verify_size(name, alignsize, enabled) { \ + long _##name_ = (enabled) ? (name) : 0; \ + _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ + if (_##name_ & ((alignsize) - 1)) { \ + pr_info(#name " must align to %d\n", \ + (alignsize)); \ + _##name_ = ALIGN(name, (alignsize)); \ + } \ + name = _##name_ / 1024; \ + pstore_zone_info->name = _##name_; \ + } + + verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); +#undef verify_size + + pstore_zone_info->total_size = dev->total_size; + pstore_zone_info->max_reason = max_reason; + pstore_zone_info->read = dev->read; + pstore_zone_info->write = dev->write; + pstore_zone_info->panic_write = dev->panic_write; + pstore_zone_info->name = KBUILD_MODNAME; + pstore_zone_info->owner = THIS_MODULE; + + ret = register_pstore_zone(pstore_zone_info); + if (ret) { + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } + mutex_unlock(&pstore_blk_lock); + return ret; +} + +static void psblk_unregister_do(struct pstore_device_info *dev) +{ + mutex_lock(&pstore_blk_lock); + if (pstore_zone_info && pstore_zone_info->read == dev->read) { + unregister_pstore_zone(pstore_zone_info); + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } + mutex_unlock(&pstore_blk_lock); +} + +/** + * psblk_get_bdev() - open block device + * + * @holder: Exclusive holder identifier + * @info: Information about bdev to fill in + * + * Return: pointer to block device on success and others on error. + * + * On success, the returned block_device has reference count of one. + */ +static struct block_device *psblk_get_bdev(void *holder, + struct bdev_info *info) +{ + struct block_device *bdev = ERR_PTR(-ENODEV); + fmode_t mode = FMODE_READ | FMODE_WRITE; + sector_t nr_sects; + + lockdep_assert_held(&pstore_blk_lock); + + if (pstore_zone_info) + return ERR_PTR(-EBUSY); + + if (!blkdev[0]) + return ERR_PTR(-ENODEV); + + if (holder) + mode |= FMODE_EXCL; + bdev = blkdev_get_by_path(blkdev, mode, holder); + if (IS_ERR(bdev)) { + dev_t devt; + + devt = name_to_dev_t(blkdev); + if (devt == 0) + return ERR_PTR(-ENODEV); + bdev = blkdev_get_by_dev(devt, mode, holder); + if (IS_ERR(bdev)) + return bdev; + } + + nr_sects = part_nr_sects_read(bdev->bd_part); + if (!nr_sects) { + pr_err("not enough space for '%s'\n", blkdev); + blkdev_put(bdev, mode); + return ERR_PTR(-ENOSPC); + } + + if (info) { + info->devt = bdev->bd_dev; + info->nr_sects = nr_sects; + info->start_sect = get_start_sect(bdev); + } + + return bdev; +} + +static void psblk_put_bdev(struct block_device *bdev, void *holder) +{ + fmode_t mode = FMODE_READ | FMODE_WRITE; + + lockdep_assert_held(&pstore_blk_lock); + + if (!bdev) + return; + + if (holder) + mode |= FMODE_EXCL; + blkdev_put(bdev, mode); +} + +static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct file file; + struct kiocb kiocb; + struct iov_iter iter; + struct kvec iov = {.iov_base = buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + file_ra_state_init(&file.f_ra, file.f_mapping); + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, READ, &iov, 1, bytes); + + return generic_file_read_iter(&kiocb, &iter); +} + +static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, + loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct iov_iter iter; + struct kiocb kiocb; + struct file file; + ssize_t ret; + struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + /* Console/Ftrace backend may handle buffer until flush dirty zones */ + if (in_interrupt() || irqs_disabled()) + return -EBUSY; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, WRITE, &iov, 1, bytes); + + inode_lock(bdev->bd_inode); + ret = generic_write_checks(&kiocb, &iter); + if (ret > 0) + ret = generic_perform_write(&file, &iter, pos); + inode_unlock(bdev->bd_inode); + + if (likely(ret > 0)) { + const struct file_operations f_op = {.fsync = blkdev_fsync}; + + file.f_op = &f_op; + kiocb.ki_pos += ret; + ret = generic_write_sync(&kiocb, ret); + } + return ret; +} + +static ssize_t psblk_blk_panic_write(const char *buf, size_t size, + loff_t off) +{ + int ret; + + if (!blkdev_panic_write) + return -EOPNOTSUPP; + + /* size and off must align to SECTOR_SIZE for block device */ + ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, + size >> SECTOR_SHIFT); + return ret ? -EIO : size; +} + +static int __register_pstore_blk(struct pstore_blk_info *info) +{ + char bdev_name[BDEVNAME_SIZE]; + struct block_device *bdev; + struct pstore_device_info dev; + struct bdev_info binfo; + void *holder = blkdev; + int ret = -ENODEV; + + lockdep_assert_held(&pstore_blk_lock); + + /* hold bdev exclusively */ + memset(&binfo, 0, sizeof(binfo)); + bdev = psblk_get_bdev(holder, &binfo); + if (IS_ERR(bdev)) { + pr_err("failed to open '%s'!\n", blkdev); + return PTR_ERR(bdev); + } + + /* only allow driver matching the @blkdev */ + if (!binfo.devt || MAJOR(binfo.devt) != info->major) { + pr_debug("invalid major %u (expect %u)\n", + info->major, MAJOR(binfo.devt)); + ret = -ENODEV; + goto err_put_bdev; + } + + /* psblk_bdev must be assigned before register to pstore/blk */ + psblk_bdev = bdev; + blkdev_panic_write = info->panic_write; + + /* Copy back block device details. */ + info->devt = binfo.devt; + info->nr_sects = binfo.nr_sects; + info->start_sect = binfo.start_sect; + + memset(&dev, 0, sizeof(dev)); + dev.total_size = info->nr_sects << SECTOR_SHIFT; + dev.flags = info->flags; + dev.read = psblk_generic_blk_read; + dev.write = psblk_generic_blk_write; + dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; + + ret = psblk_register_do(&dev); + if (ret) + goto err_put_bdev; + + bdevname(bdev, bdev_name); + pr_info("attached %s%s\n", bdev_name, + info->panic_write ? "" : " (no dedicated panic_write!)"); + return 0; + +err_put_bdev: + psblk_bdev = NULL; + blkdev_panic_write = NULL; + psblk_put_bdev(bdev, holder); + return ret; +} + +/** + * register_pstore_blk() - register block device to pstore/blk + * + * @info: details on the desired block device interface + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_blk(struct pstore_blk_info *info) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_blk(info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_blk); + +static void __unregister_pstore_blk(unsigned int major) +{ + struct pstore_device_info dev = { .read = psblk_generic_blk_read }; + void *holder = blkdev; + + lockdep_assert_held(&pstore_blk_lock); + if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { + psblk_unregister_do(&dev); + psblk_put_bdev(psblk_bdev, holder); + blkdev_panic_write = NULL; + psblk_bdev = NULL; + } +} + +/** + * unregister_pstore_blk() - unregister block device from pstore/blk + * + * @major: the major device number of device + */ +void unregister_pstore_blk(unsigned int major) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_blk(major); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_blk); + +static void __exit pstore_blk_exit(void) +{ + mutex_lock(&pstore_blk_lock); + if (psblk_bdev) + __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); + mutex_unlock(&pstore_blk_lock); +} +module_exit(pstore_blk_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao "); +MODULE_AUTHOR("Kees Cook "); +MODULE_DESCRIPTION("pstore backend for block devices"); diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h new file mode 100644 index 000000000000..4501977b1336 --- /dev/null +++ b/include/linux/pstore_blk.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PSTORE_BLK_H_ +#define __PSTORE_BLK_H_ + +#include +#include +#include + +/** + * typedef pstore_blk_panic_write_op - panic write operation to block device + * + * @buf: the data to write + * @start_sect: start sector to block device + * @sects: sectors count on buf + * + * Return: On success, zero should be returned. Others mean error. + * + * Panic write to block device must be aligned to SECTOR_SIZE. + */ +typedef int (*pstore_blk_panic_write_op)(const char *buf, sector_t start_sect, + sector_t sects); + +/** + * struct pstore_blk_info - pstore/blk registration details + * + * @major: Which major device number to support with pstore/blk + * @flags: The supported PSTORE_FLAGS_* from linux/pstore.h. + * @panic_write:The write operation only used for the panic case. + * This can be NULL, but is recommended to avoid losing + * crash data if the kernel's IO path or work queues are + * broken during a panic. + * @devt: The dev_t that pstore/blk has attached to. + * @nr_sects: Number of sectors on @devt. + * @start_sect: Starting sector on @devt. + */ +struct pstore_blk_info { + unsigned int major; + unsigned int flags; + pstore_blk_panic_write_op panic_write; + + /* Filled in by pstore/blk after registration. */ + dev_t devt; + sector_t nr_sects; + sector_t start_sect; +}; + +int register_pstore_blk(struct pstore_blk_info *info); +void unregister_pstore_blk(unsigned int major); + +#endif From 0dc068265a1c5923ffebf40388fbe93050a77ad1 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:54:59 +0800 Subject: [PATCH 35/43] pstore/zone,blk: Add support for pmsg frontend Add pmsg support to pstore/blk (through pstore/zone). To enable, pmsg_size must be greater than 0 and a multiple of 4096. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-4-keescook@chromium.org/ Co-developed-by: Colin Ian King Signed-off-by: Colin Ian King Link: https://lore.kernel.org/lkml/20200512171932.222102-1-colin.king@canonical.com Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 12 ++ fs/pstore/blk.c | 9 ++ fs/pstore/zone.c | 268 ++++++++++++++++++++++++++++++++++-- include/linux/pstore_zone.h | 2 + 4 files changed, 282 insertions(+), 9 deletions(-) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 958bec75f907..ef01c48f0ff7 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -224,3 +224,15 @@ config PSTORE_BLK_MAX_REASON NOTE that, both Kconfig and module parameters can configure pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_PMSG_SIZE + int "Size in Kbytes of pmsg to store" + depends on PSTORE_BLK + depends on PSTORE_PMSG + default 64 + help + This just sets size of pmsg (pmsg_size) for pstore/blk. The size is + in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 8f131c6e412e..a9e7078f08d6 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -27,6 +27,14 @@ module_param(max_reason, int, 0400); MODULE_PARM_DESC(max_reason, "maximum reason for kmsg dump (default 2: Oops and Panic)"); +#if IS_ENABLED(CONFIG_PSTORE_PMSG) +static long pmsg_size = CONFIG_PSTORE_BLK_PMSG_SIZE; +#else +static long pmsg_size = -1; +#endif +module_param(pmsg_size, long, 0400); +MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes"); + /* * blkdev - the block device to use for pstore storage * @@ -133,6 +141,7 @@ static int psblk_register_do(struct pstore_device_info *dev) } verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); + verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); #undef verify_size pstore_zone_info->total_size = dev->total_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 362d72a24012..2f8e3b349edb 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -27,12 +27,14 @@ * * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value) * @datalen: length of data in @data + * @start: offset into @data where the beginning of the stored bytes begin * @data: zone data. */ struct psz_buffer { #define PSZ_SIG (0x43474244) /* DBGC */ uint32_t sig; atomic_t datalen; + atomic_t start; uint8_t data[]; }; @@ -88,9 +90,11 @@ struct pstore_zone { * struct psz_context - all about running state of pstore/zone * * @kpszs: kmsg dump storage zones + * @ppsz: pmsg storage zone * @kmsg_max_cnt: max count of @kpszs * @kmsg_read_cnt: counter of total read kmsg dumps * @kmsg_write_cnt: counter of total kmsg dump writes + * @pmsg_read_cnt: counter of total read pmsg zone * @oops_counter: counter of oops dumps * @panic_counter: counter of panic dumps * @recovered: whether finished recovering data from storage @@ -101,9 +105,11 @@ struct pstore_zone { */ struct psz_context { struct pstore_zone **kpszs; + struct pstore_zone *ppsz; unsigned int kmsg_max_cnt; unsigned int kmsg_read_cnt; unsigned int kmsg_write_cnt; + unsigned int pmsg_read_cnt; /* * These counters should be calculated during recovery. * It records the oops/panic times after crashes rather than boots. @@ -143,15 +149,20 @@ static inline int buffer_datalen(struct pstore_zone *zone) return atomic_read(&zone->buffer->datalen); } +static inline int buffer_start(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->start); +} + static inline bool is_on_panic(void) { return atomic_read(&pstore_zone_cxt.on_panic); } -static ssize_t psz_zone_read(struct pstore_zone *zone, char *buf, +static ssize_t psz_zone_read_buffer(struct pstore_zone *zone, char *buf, size_t len, unsigned long off) { - if (!buf || !zone->buffer) + if (!buf || !zone || !zone->buffer) return -EINVAL; if (off > zone->buffer_size) return -EINVAL; @@ -160,6 +171,18 @@ static ssize_t psz_zone_read(struct pstore_zone *zone, char *buf, return len; } +static int psz_zone_read_oldbuf(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone || !zone->oldbuf) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->oldbuf->data + off, len); + return 0; +} + static int psz_zone_write(struct pstore_zone *zone, enum psz_flush_mode flush_mode, const char *buf, size_t len, unsigned long off) @@ -415,6 +438,93 @@ recover_fail: return ret; } +static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct psz_buffer *oldbuf, tmpbuf; + int ret = 0; + char *buf; + ssize_t rcnt, len, start, off; + + if (!zone || zone->oldbuf) + return 0; + + if (is_on_panic()) { + /* save data as much as possible */ + psz_flush_dirty_zone(zone); + return 0; + } + + if (unlikely(!info->read)) + return -EINVAL; + + len = sizeof(struct psz_buffer); + rcnt = info->read((char *)&tmpbuf, len, zone->off); + if (rcnt != len) { + pr_debug("read zone %s failed\n", zone->name); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (tmpbuf.sig != zone->buffer->sig) { + pr_debug("no valid data in zone %s\n", zone->name); + return 0; + } + + if (zone->buffer_size < atomic_read(&tmpbuf.datalen) || + zone->buffer_size < atomic_read(&tmpbuf.start)) { + pr_info("found overtop zone: %s: off %lld, size %zu\n", + zone->name, zone->off, zone->buffer_size); + /* just keep going */ + return 0; + } + + if (!atomic_read(&tmpbuf.datalen)) { + pr_debug("found erased zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + return 0; + } + + pr_debug("found nice zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + + len = atomic_read(&tmpbuf.datalen) + sizeof(*oldbuf); + oldbuf = kzalloc(len, GFP_KERNEL); + if (!oldbuf) + return -ENOMEM; + + memcpy(oldbuf, &tmpbuf, sizeof(*oldbuf)); + buf = (char *)oldbuf + sizeof(*oldbuf); + len = atomic_read(&oldbuf->datalen); + start = atomic_read(&oldbuf->start); + off = zone->off + sizeof(*oldbuf); + + /* get part of data */ + rcnt = info->read(buf, len - start, off + start); + if (rcnt != len - start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + /* get the rest of data */ + rcnt = info->read(buf + len - start, start, off); + if (rcnt != start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + zone->oldbuf = oldbuf; + psz_flush_dirty_zone(zone); + return 0; + +free_oldbuf: + kfree(oldbuf); + return ret; +} + /** * psz_recovery() - recover data from storage * @cxt: the context of pstore/zone @@ -431,7 +541,12 @@ static inline int psz_recovery(struct psz_context *cxt) return 0; ret = psz_kmsg_recover(cxt); + if (ret) + goto out; + ret = psz_recover_zone(cxt, cxt->ppsz); + +out: if (unlikely(ret)) pr_err("recover failed\n"); else { @@ -446,9 +561,17 @@ static int psz_pstore_open(struct pstore_info *psi) struct psz_context *cxt = psi->data; cxt->kmsg_read_cnt = 0; + cxt->pmsg_read_cnt = 0; return 0; } +static inline bool psz_old_ok(struct pstore_zone *zone) +{ + if (zone && zone->oldbuf && atomic_read(&zone->oldbuf->datalen)) + return true; + return false; +} + static inline bool psz_ok(struct pstore_zone *zone) { if (zone && zone->buffer && buffer_datalen(zone)) @@ -473,6 +596,25 @@ static inline int psz_kmsg_erase(struct psz_context *cxt, return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); } +static inline int psz_record_erase(struct psz_context *cxt, + struct pstore_zone *zone) +{ + if (unlikely(!psz_old_ok(zone))) + return 0; + + kfree(zone->oldbuf); + zone->oldbuf = NULL; + /* + * if there are new data in zone buffer, that means the old data + * are already invalid. It is no need to flush 0 (erase) to + * block device. + */ + if (!buffer_datalen(zone)) + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + psz_flush_dirty_zone(zone); + return 0; +} + static int psz_pstore_erase(struct pstore_record *record) { struct psz_context *cxt = record->psi->data; @@ -482,6 +624,8 @@ static int psz_pstore_erase(struct pstore_record *record) if (record->id >= cxt->kmsg_max_cnt) return -EINVAL; return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); + case PSTORE_TYPE_PMSG: + return psz_record_erase(cxt, cxt->ppsz); default: return -EINVAL; } @@ -555,6 +699,55 @@ static int notrace psz_kmsg_write(struct psz_context *cxt, return 0; } +static int notrace psz_record_write(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t start, rem; + bool is_full_data = false; + char *buf; + int cnt; + + if (!zone || !record) + return -ENOSPC; + + if (atomic_read(&zone->buffer->datalen) >= zone->buffer_size) + is_full_data = true; + + cnt = record->size; + buf = record->buf; + if (unlikely(cnt > zone->buffer_size)) { + buf += cnt - zone->buffer_size; + cnt = zone->buffer_size; + } + + start = buffer_start(zone); + rem = zone->buffer_size - start; + if (unlikely(rem < cnt)) { + psz_zone_write(zone, FLUSH_PART, buf, rem, start); + buf += rem; + cnt -= rem; + start = 0; + is_full_data = true; + } + + atomic_set(&zone->buffer->start, cnt + start); + psz_zone_write(zone, FLUSH_PART, buf, cnt, start); + + /** + * psz_zone_write will set datalen as start + cnt. + * It work if actual data length lesser than buffer size. + * If data length greater than buffer size, pmsg will rewrite to + * beginning of zone, which make buffer->datalen wrongly. + * So we should reset datalen as buffer size once actual data length + * greater than buffer size. + */ + if (is_full_data) { + atomic_set(&zone->buffer->datalen, zone->buffer_size); + psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + } + return 0; +} + static int notrace psz_pstore_write(struct pstore_record *record) { struct psz_context *cxt = record->psi->data; @@ -566,6 +759,8 @@ static int notrace psz_pstore_write(struct pstore_record *record) switch (record->type) { case PSTORE_TYPE_DMESG: return psz_kmsg_write(cxt, record); + case PSTORE_TYPE_PMSG: + return psz_record_write(cxt->ppsz, record); default: return -EINVAL; } @@ -581,6 +776,13 @@ static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) return zone; } + if (cxt->pmsg_read_cnt == 0) { + cxt->pmsg_read_cnt++; + zone = cxt->ppsz; + if (psz_old_ok(zone)) + return zone; + } + return NULL; } @@ -631,7 +833,7 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, return -ENOMEM; } - size = psz_zone_read(zone, record->buf + hlen, size, + size = psz_zone_read_buffer(zone, record->buf + hlen, size, sizeof(struct psz_kmsg_header)); if (unlikely(size < 0)) { kfree(record->buf); @@ -641,6 +843,32 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, return size + hlen; } +static ssize_t psz_record_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t len; + struct psz_buffer *buf; + + if (!zone || !record) + return -ENOSPC; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + len = atomic_read(&buf->datalen); + record->buf = kmalloc(len, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(psz_zone_read_oldbuf(zone, record->buf, len, 0))) { + kfree(record->buf); + return -ENOMSG; + } + + return len; +} + static ssize_t psz_pstore_read(struct pstore_record *record) { struct psz_context *cxt = record->psi->data; @@ -665,6 +893,9 @@ next_zone: readop = psz_kmsg_read; record->id = cxt->kmsg_read_cnt - 1; break; + case PSTORE_TYPE_PMSG: + readop = psz_record_read; + break; default: goto next_zone; } @@ -720,6 +951,8 @@ static void psz_free_all_zones(struct psz_context *cxt) { if (cxt->kpszs) psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); + if (cxt->ppsz) + psz_free_zone(&cxt->ppsz); } static struct pstore_zone *psz_init_zone(enum pstore_type_id type, @@ -753,8 +986,10 @@ static struct pstore_zone *psz_init_zone(enum pstore_type_id type, zone->type = type; zone->buffer_size = size - sizeof(struct psz_buffer); zone->buffer->sig = type ^ PSZ_SIG; + zone->oldbuf = NULL; atomic_set(&zone->dirty, 0); atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->buffer->start, 0); *off += size; @@ -809,19 +1044,28 @@ static int psz_alloc_zones(struct psz_context *cxt) struct pstore_zone_info *info = cxt->pstore_zone_info; loff_t off = 0; int err; - size_t size; + size_t off_size = 0; - size = info->total_size; - cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, size, + off_size += info->pmsg_size; + cxt->ppsz = psz_init_zone(PSTORE_TYPE_PMSG, &off, info->pmsg_size); + if (IS_ERR(cxt->ppsz)) { + err = PTR_ERR(cxt->ppsz); + cxt->ppsz = NULL; + goto free_out; + } + + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, + info->total_size - off_size, info->kmsg_size, &cxt->kmsg_max_cnt); if (IS_ERR(cxt->kpszs)) { err = PTR_ERR(cxt->kpszs); cxt->kpszs = NULL; - goto fail_out; + goto free_out; } return 0; -fail_out: +free_out: + psz_free_all_zones(cxt); return err; } @@ -844,7 +1088,7 @@ int register_pstore_zone(struct pstore_zone_info *info) return -EINVAL; } - if (!info->kmsg_size) { + if (!info->kmsg_size && !info->pmsg_size) { pr_warn("at least one record size must be non-zero\n"); return -EINVAL; } @@ -866,6 +1110,7 @@ int register_pstore_zone(struct pstore_zone_info *info) check_size(total_size, 4096); check_size(kmsg_size, SECTOR_SIZE); + check_size(pmsg_size, SECTOR_SIZE); #undef check_size @@ -891,6 +1136,7 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_debug("register %s with properties:\n", info->name); pr_debug("\ttotal size : %ld Bytes\n", info->total_size); pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); + pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); err = psz_alloc_zones(cxt); if (err) { @@ -920,6 +1166,10 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_cont(",panic_write"); pr_cont(")"); } + if (info->pmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_PMSG; + pr_cont(" pmsg"); + } pr_cont("\n"); err = pstore_register(&cxt->pstore); diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index eb005d9ae40c..29c367a3bd80 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -17,6 +17,7 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @kmsg_size: The size of oops/panic zone. Zero means disabled, otherwise, * it must be multiple of SECTOR_SIZE(512 Bytes). * @max_reason: Maximum kmsg dump reason to store. + * @pmsg_size: The size of pmsg zone which is the same as @kmsg_size. * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others @@ -33,6 +34,7 @@ struct pstore_zone_info { unsigned long total_size; unsigned long kmsg_size; int max_reason; + unsigned long pmsg_size; pstore_zone_read_op read; pstore_zone_write_op write; pstore_zone_write_op panic_write; From cc9c4d1b5597167f8e8c92f6b61e1cda6d01884d Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:00 +0800 Subject: [PATCH 36/43] pstore/zone,blk: Add console frontend support Support backend for console. To enable console backend, just make console_size be greater than 0 and a multiple of 4096. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-5-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 18 +++++++-- fs/pstore/blk.c | 12 +++++- fs/pstore/zone.c | 81 ++++++++++++++++++++++++++++++++++--- include/linux/pstore_zone.h | 4 +- 4 files changed, 105 insertions(+), 10 deletions(-) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index ef01c48f0ff7..126aa6c3ecf2 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -180,11 +180,11 @@ config PSTORE_BLK_BLKDEV help Which block device should be used for pstore/blk. - It accept the following variants: + It accepts the following variants: 1) device number in hexadecimal representation, with no leading 0x, for example b302. - 2) /dev/ represents the device number of disk - 3) /dev/ represents the device number + 2) /dev/ represents the device name of disk + 3) /dev/ represents the device name and number of partition - device number of disk plus the partition number 4) /dev/p - same as the above, this form is used when disk name of partitioned disk ends with a digit. @@ -236,3 +236,15 @@ config PSTORE_BLK_PMSG_SIZE NOTE that, both Kconfig and module parameters can configure pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_CONSOLE_SIZE + int "Size in Kbytes of console log to store" + depends on PSTORE_BLK + depends on PSTORE_CONSOLE + default 64 + help + This just sets size of console log (console_size) to store via + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index a9e7078f08d6..082f6cdaf4bd 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -35,6 +35,14 @@ static long pmsg_size = -1; module_param(pmsg_size, long, 0400); MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes"); +#if IS_ENABLED(CONFIG_PSTORE_CONSOLE) +static long console_size = CONFIG_PSTORE_BLK_CONSOLE_SIZE; +#else +static long console_size = -1; +#endif +module_param(console_size, long, 0400); +MODULE_PARM_DESC(console_size, "console size in kbytes"); + /* * blkdev - the block device to use for pstore storage * @@ -91,7 +99,8 @@ struct bdev_info { * whole disk). * On success, the number of bytes should be returned, others * means error. - * @write: The same as @read. + * @write: The same as @read, but the following error number: + * -EBUSY means try to write again later. * @panic_write:The write operation only used for panic case. It's optional * if you do not care panic log. The parameters and return value * are the same as @read. @@ -142,6 +151,7 @@ static int psblk_register_do(struct pstore_device_info *dev) verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); + verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); #undef verify_size pstore_zone_info->total_size = dev->total_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 2f8e3b349edb..ee0b64da410d 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -91,10 +91,12 @@ struct pstore_zone { * * @kpszs: kmsg dump storage zones * @ppsz: pmsg storage zone + * @cpsz: console storage zone * @kmsg_max_cnt: max count of @kpszs * @kmsg_read_cnt: counter of total read kmsg dumps * @kmsg_write_cnt: counter of total kmsg dump writes * @pmsg_read_cnt: counter of total read pmsg zone + * @console_read_cnt: counter of total read console zone * @oops_counter: counter of oops dumps * @panic_counter: counter of panic dumps * @recovered: whether finished recovering data from storage @@ -106,10 +108,12 @@ struct pstore_zone { struct psz_context { struct pstore_zone **kpszs; struct pstore_zone *ppsz; + struct pstore_zone *cpsz; unsigned int kmsg_max_cnt; unsigned int kmsg_read_cnt; unsigned int kmsg_write_cnt; unsigned int pmsg_read_cnt; + unsigned int console_read_cnt; /* * These counters should be calculated during recovery. * It records the oops/panic times after crashes rather than boots. @@ -129,6 +133,9 @@ struct psz_context { }; static struct psz_context pstore_zone_cxt; +static void psz_flush_all_dirty_zones(struct work_struct *); +static DECLARE_DELAYED_WORK(psz_cleaner, psz_flush_all_dirty_zones); + /** * enum psz_flush_mode - flush mode for psz_zone_write() * @@ -237,6 +244,9 @@ static int psz_zone_write(struct pstore_zone *zone, return 0; dirty: atomic_set(&zone->dirty, true); + /* flush dirty zones nicely */ + if (wcnt == -EBUSY && !is_on_panic()) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(500)); return -EBUSY; } @@ -293,6 +303,21 @@ static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new) return 0; } +static void psz_flush_all_dirty_zones(struct work_struct *work) +{ + struct psz_context *cxt = &pstore_zone_cxt; + int ret = 0; + + if (cxt->ppsz) + ret |= psz_flush_dirty_zone(cxt->ppsz); + if (cxt->cpsz) + ret |= psz_flush_dirty_zone(cxt->cpsz); + if (cxt->kpszs) + ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + if (ret && cxt->pstore_zone_info) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); +} + static int psz_kmsg_recover_data(struct psz_context *cxt) { struct pstore_zone_info *info = cxt->pstore_zone_info; @@ -545,6 +570,10 @@ static inline int psz_recovery(struct psz_context *cxt) goto out; ret = psz_recover_zone(cxt, cxt->ppsz); + if (ret) + goto out; + + ret = psz_recover_zone(cxt, cxt->cpsz); out: if (unlikely(ret)) @@ -562,6 +591,7 @@ static int psz_pstore_open(struct pstore_info *psi) cxt->kmsg_read_cnt = 0; cxt->pmsg_read_cnt = 0; + cxt->console_read_cnt = 0; return 0; } @@ -626,8 +656,9 @@ static int psz_pstore_erase(struct pstore_record *record) return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); case PSTORE_TYPE_PMSG: return psz_record_erase(cxt, cxt->ppsz); - default: - return -EINVAL; + case PSTORE_TYPE_CONSOLE: + return psz_record_erase(cxt, cxt->cpsz); + default: return -EINVAL; } } @@ -690,9 +721,10 @@ static int notrace psz_kmsg_write(struct psz_context *cxt, return -ENOSPC; ret = psz_kmsg_write_record(cxt, record); - if (!ret) { + if (!ret && is_on_panic()) { + /* ensure all data are flushed to storage when panic */ pr_debug("try to flush other dirty zones\n"); - psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + psz_flush_all_dirty_zones(NULL); } /* always return 0 as we had handled it on buffer */ @@ -756,9 +788,18 @@ static int notrace psz_pstore_write(struct pstore_record *record) record->reason == KMSG_DUMP_PANIC) atomic_set(&cxt->on_panic, 1); + /* + * if on panic, do not write except panic records + * Fix case that panic_write prints log which wakes up console backend. + */ + if (is_on_panic() && record->type != PSTORE_TYPE_DMESG) + return -EBUSY; + switch (record->type) { case PSTORE_TYPE_DMESG: return psz_kmsg_write(cxt, record); + case PSTORE_TYPE_CONSOLE: + return psz_record_write(cxt->cpsz, record); case PSTORE_TYPE_PMSG: return psz_record_write(cxt->ppsz, record); default: @@ -783,6 +824,13 @@ static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) return zone; } + if (cxt->console_read_cnt == 0) { + cxt->console_read_cnt++; + zone = cxt->cpsz; + if (psz_old_ok(zone)) + return zone; + } + return NULL; } @@ -893,6 +941,8 @@ next_zone: readop = psz_kmsg_read; record->id = cxt->kmsg_read_cnt - 1; break; + case PSTORE_TYPE_CONSOLE: + fallthrough; case PSTORE_TYPE_PMSG: readop = psz_record_read; break; @@ -953,6 +1003,8 @@ static void psz_free_all_zones(struct psz_context *cxt) psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); if (cxt->ppsz) psz_free_zone(&cxt->ppsz); + if (cxt->cpsz) + psz_free_zone(&cxt->cpsz); } static struct pstore_zone *psz_init_zone(enum pstore_type_id type, @@ -1054,6 +1106,15 @@ static int psz_alloc_zones(struct psz_context *cxt) goto free_out; } + off_size += info->console_size; + cxt->cpsz = psz_init_zone(PSTORE_TYPE_CONSOLE, &off, + info->console_size); + if (IS_ERR(cxt->cpsz)) { + err = PTR_ERR(cxt->cpsz); + cxt->cpsz = NULL; + goto free_out; + } + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, info->total_size - off_size, info->kmsg_size, &cxt->kmsg_max_cnt); @@ -1088,7 +1149,7 @@ int register_pstore_zone(struct pstore_zone_info *info) return -EINVAL; } - if (!info->kmsg_size && !info->pmsg_size) { + if (!info->kmsg_size && !info->pmsg_size && !info->console_size) { pr_warn("at least one record size must be non-zero\n"); return -EINVAL; } @@ -1111,6 +1172,7 @@ int register_pstore_zone(struct pstore_zone_info *info) check_size(total_size, 4096); check_size(kmsg_size, SECTOR_SIZE); check_size(pmsg_size, SECTOR_SIZE); + check_size(console_size, SECTOR_SIZE); #undef check_size @@ -1137,6 +1199,7 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_debug("\ttotal size : %ld Bytes\n", info->total_size); pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); + pr_debug("\tconsole size : %ld Bytes\n", info->console_size); err = psz_alloc_zones(cxt); if (err) { @@ -1170,6 +1233,10 @@ int register_pstore_zone(struct pstore_zone_info *info) cxt->pstore.flags |= PSTORE_FLAGS_PMSG; pr_cont(" pmsg"); } + if (info->console_size) { + cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; + pr_cont(" console"); + } pr_cont("\n"); err = pstore_register(&cxt->pstore); @@ -1211,6 +1278,10 @@ void unregister_pstore_zone(struct pstore_zone_info *info) /* Stop incoming writes from pstore. */ pstore_unregister(&cxt->pstore); + /* Flush any pending writes. */ + psz_flush_all_dirty_zones(NULL); + flush_delayed_work(&psz_cleaner); + /* Clean up allocations. */ kfree(cxt->pstore.buf); cxt->pstore.buf = NULL; diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index 29c367a3bd80..904ee67f4ba2 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -18,11 +18,12 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * it must be multiple of SECTOR_SIZE(512 Bytes). * @max_reason: Maximum kmsg dump reason to store. * @pmsg_size: The size of pmsg zone which is the same as @kmsg_size. + * @console_size:The size of console zone which is the same as @kmsg_size. * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others * means error. - * @write: The same as @read. + * @write: The same as @read, but -EBUSY means try to write again later. * @panic_write:The write operation only used for panic case. It's optional * if you do not care panic log. The parameters and return value * are the same as @read. @@ -35,6 +36,7 @@ struct pstore_zone_info { unsigned long kmsg_size; int max_reason; unsigned long pmsg_size; + unsigned long console_size; pstore_zone_read_op read; pstore_zone_write_op write; pstore_zone_write_op panic_write; From 34327e9fd213414b35eb70aa512c4e39b2095907 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:01 +0800 Subject: [PATCH 37/43] pstore/zone,blk: Add ftrace frontend support Support backend for ftrace. To enable ftrace backend, just make ftrace_size be greater than 0 and a multiple of 4096. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-6-keescook@chromium.org/ Co-developed-by: Colin Ian King Signed-off-by: Colin Ian King Link: https://lore.kernel.org/lkml/20200512170719.221514-1-colin.king@canonical.com Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 12 ++++ fs/pstore/blk.c | 9 +++ fs/pstore/zone.c | 114 +++++++++++++++++++++++++++++++++++- include/linux/pstore_zone.h | 2 + 4 files changed, 136 insertions(+), 1 deletion(-) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 126aa6c3ecf2..c2237984b407 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -248,3 +248,15 @@ config PSTORE_BLK_CONSOLE_SIZE NOTE that, both Kconfig and module parameters can configure pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_FTRACE_SIZE + int "Size in Kbytes of ftrace log to store" + depends on PSTORE_BLK + depends on PSTORE_FTRACE + default 64 + help + This just sets size of ftrace log (ftrace_size) for pstore/blk. The + size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 082f6cdaf4bd..e0d95fb48428 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -43,6 +43,14 @@ static long console_size = -1; module_param(console_size, long, 0400); MODULE_PARM_DESC(console_size, "console size in kbytes"); +#if IS_ENABLED(CONFIG_PSTORE_FTRACE) +static long ftrace_size = CONFIG_PSTORE_BLK_FTRACE_SIZE; +#else +static long ftrace_size = -1; +#endif +module_param(ftrace_size, long, 0400); +MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); + /* * blkdev - the block device to use for pstore storage * @@ -152,6 +160,7 @@ static int psblk_register_do(struct pstore_device_info *dev) verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); + verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); #undef verify_size pstore_zone_info->total_size = dev->total_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index ee0b64da410d..2d9f452360bb 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -92,11 +92,14 @@ struct pstore_zone { * @kpszs: kmsg dump storage zones * @ppsz: pmsg storage zone * @cpsz: console storage zone + * @fpszs: ftrace storage zones * @kmsg_max_cnt: max count of @kpszs * @kmsg_read_cnt: counter of total read kmsg dumps * @kmsg_write_cnt: counter of total kmsg dump writes * @pmsg_read_cnt: counter of total read pmsg zone * @console_read_cnt: counter of total read console zone + * @ftrace_max_cnt: max count of @fpszs + * @ftrace_read_cnt: counter of max read ftrace zone * @oops_counter: counter of oops dumps * @panic_counter: counter of panic dumps * @recovered: whether finished recovering data from storage @@ -109,11 +112,14 @@ struct psz_context { struct pstore_zone **kpszs; struct pstore_zone *ppsz; struct pstore_zone *cpsz; + struct pstore_zone **fpszs; unsigned int kmsg_max_cnt; unsigned int kmsg_read_cnt; unsigned int kmsg_write_cnt; unsigned int pmsg_read_cnt; unsigned int console_read_cnt; + unsigned int ftrace_max_cnt; + unsigned int ftrace_read_cnt; /* * These counters should be calculated during recovery. * It records the oops/panic times after crashes rather than boots. @@ -314,6 +320,8 @@ static void psz_flush_all_dirty_zones(struct work_struct *work) ret |= psz_flush_dirty_zone(cxt->cpsz); if (cxt->kpszs) ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + if (cxt->fpszs) + ret |= psz_flush_dirty_zones(cxt->fpszs, cxt->ftrace_max_cnt); if (ret && cxt->pstore_zone_info) schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); } @@ -550,6 +558,31 @@ free_oldbuf: return ret; } +static int psz_recover_zones(struct psz_context *cxt, + struct pstore_zone **zones, unsigned int cnt) +{ + int ret; + unsigned int i; + struct pstore_zone *zone; + + if (!zones) + return 0; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (unlikely(!zone)) + continue; + ret = psz_recover_zone(cxt, zone); + if (ret) + goto recover_fail; + } + + return 0; +recover_fail: + pr_debug("recover %s[%u] failed\n", zone->name, i); + return ret; +} + /** * psz_recovery() - recover data from storage * @cxt: the context of pstore/zone @@ -574,6 +607,10 @@ static inline int psz_recovery(struct psz_context *cxt) goto out; ret = psz_recover_zone(cxt, cxt->cpsz); + if (ret) + goto out; + + ret = psz_recover_zones(cxt, cxt->fpszs, cxt->ftrace_max_cnt); out: if (unlikely(ret)) @@ -592,6 +629,7 @@ static int psz_pstore_open(struct pstore_info *psi) cxt->kmsg_read_cnt = 0; cxt->pmsg_read_cnt = 0; cxt->console_read_cnt = 0; + cxt->ftrace_read_cnt = 0; return 0; } @@ -658,6 +696,10 @@ static int psz_pstore_erase(struct pstore_record *record) return psz_record_erase(cxt, cxt->ppsz); case PSTORE_TYPE_CONSOLE: return psz_record_erase(cxt, cxt->cpsz); + case PSTORE_TYPE_FTRACE: + if (record->id >= cxt->ftrace_max_cnt) + return -EINVAL; + return psz_record_erase(cxt, cxt->fpszs[record->id]); default: return -EINVAL; } } @@ -802,6 +844,13 @@ static int notrace psz_pstore_write(struct pstore_record *record) return psz_record_write(cxt->cpsz, record); case PSTORE_TYPE_PMSG: return psz_record_write(cxt->ppsz, record); + case PSTORE_TYPE_FTRACE: { + int zonenum = smp_processor_id(); + + if (!cxt->fpszs) + return -ENOSPC; + return psz_record_write(cxt->fpszs[zonenum], record); + } default: return -EINVAL; } @@ -817,6 +866,14 @@ static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) return zone; } + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* + * No need psz_old_ok(). Let psz_ftrace_read() do so for + * combination. psz_ftrace_read() should traverse over + * all zones in case of some zone without data. + */ + return cxt->fpszs[cxt->ftrace_read_cnt++]; + if (cxt->pmsg_read_cnt == 0) { cxt->pmsg_read_cnt++; zone = cxt->ppsz; @@ -891,6 +948,38 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone, return size + hlen; } +/* try to combine all ftrace zones */ +static ssize_t psz_ftrace_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt; + struct psz_buffer *buf; + int ret; + + if (!zone || !record) + return -ENOSPC; + + if (!psz_old_ok(zone)) + goto out; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + ret = pstore_ftrace_combine_log(&record->buf, &record->size, + (char *)buf->data, atomic_read(&buf->datalen)); + if (unlikely(ret)) + return ret; + +out: + cxt = record->psi->data; + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* then, read next ftrace zone */ + return -ENOMSG; + record->id = 0; + return record->size ? record->size : -ENOMSG; +} + static ssize_t psz_record_read(struct pstore_zone *zone, struct pstore_record *record) { @@ -941,6 +1030,9 @@ next_zone: readop = psz_kmsg_read; record->id = cxt->kmsg_read_cnt - 1; break; + case PSTORE_TYPE_FTRACE: + readop = psz_ftrace_read; + break; case PSTORE_TYPE_CONSOLE: fallthrough; case PSTORE_TYPE_PMSG: @@ -1005,6 +1097,8 @@ static void psz_free_all_zones(struct psz_context *cxt) psz_free_zone(&cxt->ppsz); if (cxt->cpsz) psz_free_zone(&cxt->cpsz); + if (cxt->fpszs) + psz_free_zones(&cxt->fpszs, &cxt->ftrace_max_cnt); } static struct pstore_zone *psz_init_zone(enum pstore_type_id type, @@ -1115,6 +1209,17 @@ static int psz_alloc_zones(struct psz_context *cxt) goto free_out; } + off_size += info->ftrace_size; + cxt->fpszs = psz_init_zones(PSTORE_TYPE_FTRACE, &off, + info->ftrace_size, + info->ftrace_size / nr_cpu_ids, + &cxt->ftrace_max_cnt); + if (IS_ERR(cxt->fpszs)) { + err = PTR_ERR(cxt->fpszs); + cxt->fpszs = NULL; + goto free_out; + } + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, info->total_size - off_size, info->kmsg_size, &cxt->kmsg_max_cnt); @@ -1149,7 +1254,8 @@ int register_pstore_zone(struct pstore_zone_info *info) return -EINVAL; } - if (!info->kmsg_size && !info->pmsg_size && !info->console_size) { + if (!info->kmsg_size && !info->pmsg_size && !info->console_size && + !info->ftrace_size) { pr_warn("at least one record size must be non-zero\n"); return -EINVAL; } @@ -1173,6 +1279,7 @@ int register_pstore_zone(struct pstore_zone_info *info) check_size(kmsg_size, SECTOR_SIZE); check_size(pmsg_size, SECTOR_SIZE); check_size(console_size, SECTOR_SIZE); + check_size(ftrace_size, SECTOR_SIZE); #undef check_size @@ -1200,6 +1307,7 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); pr_debug("\tconsole size : %ld Bytes\n", info->console_size); + pr_debug("\tftrace size : %ld Bytes\n", info->ftrace_size); err = psz_alloc_zones(cxt); if (err) { @@ -1237,6 +1345,10 @@ int register_pstore_zone(struct pstore_zone_info *info) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; pr_cont(" console"); } + if (info->ftrace_size) { + cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; + pr_cont(" ftrace"); + } pr_cont("\n"); err = pstore_register(&cxt->pstore); diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index 904ee67f4ba2..6f16b0dd834a 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -19,6 +19,7 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @max_reason: Maximum kmsg dump reason to store. * @pmsg_size: The size of pmsg zone which is the same as @kmsg_size. * @console_size:The size of console zone which is the same as @kmsg_size. + * @ftrace_size:The size of ftrace zone which is the same as @kmsg_size. * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others @@ -37,6 +38,7 @@ struct pstore_zone_info { int max_reason; unsigned long pmsg_size; unsigned long console_size; + unsigned long ftrace_size; pstore_zone_read_op read; pstore_zone_write_op write; pstore_zone_write_op panic_write; From 649304c936cd4d2a2128bb877044772416c7d4f5 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:02 +0800 Subject: [PATCH 38/43] Documentation: Add details for pstore/blk Add details on using pstore/blk, the new backend of pstore to record dumps to block devices, in Documentation/admin-guide/pstore-blk.rst Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-7-keescook@chromium.org/ Signed-off-by: Kees Cook --- Documentation/admin-guide/pstore-blk.rst | 229 +++++++++++++++++++++++ MAINTAINERS | 1 + fs/pstore/Kconfig | 2 + 3 files changed, 232 insertions(+) create mode 100644 Documentation/admin-guide/pstore-blk.rst diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst new file mode 100644 index 000000000000..bef8c7436721 --- /dev/null +++ b/Documentation/admin-guide/pstore-blk.rst @@ -0,0 +1,229 @@ +.. SPDX-License-Identifier: GPL-2.0 + +pstore block oops/panic logger +============================== + +Introduction +------------ + +pstore block (pstore/blk) is an oops/panic logger that writes its logs to a +block device before the system crashes. You can get these log files by +mounting pstore filesystem like:: + + mount -t pstore pstore /sys/fs/pstore + + +pstore block concepts +--------------------- + +pstore/blk provides efficient configuration method for pstore/blk, which +divides all configurations into two parts, configurations for user and +configurations for driver. + +Configurations for user determine how pstore/blk works, such as pmsg_size, +kmsg_size and so on. All of them support both Kconfig and module parameters, +but module parameters have priority over Kconfig. + +Configurations for driver are all about block device, such as total_size +of block device and read/write operations. + +Configurations for user +----------------------- + +All of these configurations support both Kconfig and module parameters, but +module parameters have priority over Kconfig. + +Here is an example for module parameters:: + + pstore_blk.blkdev=179:7 pstore_blk.kmsg_size=64 + +The detail of each configurations may be of interest to you. + +blkdev +~~~~~~ + +The block device to use. Most of the time, it is a partition of block device. +It's required for pstore/blk. + +It accepts the following variants: + +1. device number in hexadecimal represents itself; no + leading 0x, for example b302. +#. /dev/ represents the device number of disk +#. /dev/ represents the device number of partition - device + number of disk plus the partition number +#. /dev/p - same as the above; this form is used when disk + name of partitioned disk ends with a digit. +#. PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF represents the unique id of + a partition if the partition table provides it. The UUID may be either an + EFI/GPT UUID, or refer to an MSDOS partition using the format SSSSSSSS-PP, + where SSSSSSSS is a zero-filled hex representation of the 32-bit + "NT disk signature", and PP is a zero-filled hex representation of the + 1-based partition number. +#. PARTUUID=/PARTNROFF= to select a partition in relation to a + partition with a known unique id. +#. : major and minor number of the device separated by a colon. + +kmsg_size +~~~~~~~~~ + +The chunk size in KB for oops/panic front-end. It **MUST** be a multiple of 4. +It's optional if you do not care oops/panic log. + +There are multiple chunks for oops/panic front-end depending on the remaining +space except other pstore front-ends. + +pstore/blk will log to oops/panic chunks one by one, and always overwrite the +oldest chunk if there is no more free chunk. + +pmsg_size +~~~~~~~~~ + +The chunk size in KB for pmsg front-end. It **MUST** be a multiple of 4. +It's optional if you do not care pmsg log. + +Unlike oops/panic front-end, there is only one chunk for pmsg front-end. + +Pmsg is a user space accessible pstore object. Writes to */dev/pmsg0* are +appended to the chunk. On reboot the contents are available in +*/sys/fs/pstore/pmsg-pstore-blk-0*. + +console_size +~~~~~~~~~~~~ + +The chunk size in KB for console front-end. It **MUST** be a multiple of 4. +It's optional if you do not care console log. + +Similar to pmsg front-end, there is only one chunk for console front-end. + +All log of console will be appended to the chunk. On reboot the contents are +available in */sys/fs/pstore/console-pstore-blk-0*. + +ftrace_size +~~~~~~~~~~~ + +The chunk size in KB for ftrace front-end. It **MUST** be a multiple of 4. +It's optional if you do not care console log. + +Similar to oops front-end, there are multiple chunks for ftrace front-end +depending on the count of cpu processors. Each chunk size is equal to +ftrace_size / processors_count. + +All log of ftrace will be appended to the chunk. On reboot the contents are +combined and available in */sys/fs/pstore/ftrace-pstore-blk-0*. + +Persistent function tracing might be useful for debugging software or hardware +related hangs. Here is an example of usage:: + + # mount -t pstore pstore /sys/fs/pstore + # mount -t debugfs debugfs /sys/kernel/debug/ + # echo 1 > /sys/kernel/debug/pstore/record_ftrace + # reboot -f + [...] + # mount -t pstore pstore /sys/fs/pstore + # tail /sys/fs/pstore/ftrace-pstore-blk-0 + CPU:0 ts:5914676 c0063828 c0063b94 call_cpuidle <- cpu_startup_entry+0x1b8/0x1e0 + CPU:0 ts:5914678 c039ecdc c006385c cpuidle_enter_state <- call_cpuidle+0x44/0x48 + CPU:0 ts:5914680 c039e9a0 c039ecf0 cpuidle_enter_freeze <- cpuidle_enter_state+0x304/0x314 + CPU:0 ts:5914681 c0063870 c039ea30 sched_idle_set_state <- cpuidle_enter_state+0x44/0x314 + CPU:1 ts:5916720 c0160f59 c015ee04 kernfs_unmap_bin_file <- __kernfs_remove+0x140/0x204 + CPU:1 ts:5916721 c05ca625 c015ee0c __mutex_lock_slowpath <- __kernfs_remove+0x148/0x204 + CPU:1 ts:5916723 c05c813d c05ca630 yield_to <- __mutex_lock_slowpath+0x314/0x358 + CPU:1 ts:5916724 c05ca2d1 c05ca638 __ww_mutex_lock <- __mutex_lock_slowpath+0x31c/0x358 + +max_reason +~~~~~~~~~~ + +Limiting which kinds of kmsg dumps are stored can be controlled via +the ``max_reason`` value, as defined in include/linux/kmsg_dump.h's +``enum kmsg_dump_reason``. For example, to store both Oopses and Panics, +``max_reason`` should be set to 2 (KMSG_DUMP_OOPS), to store only Panics +``max_reason`` should be set to 1 (KMSG_DUMP_PANIC). Setting this to 0 +(KMSG_DUMP_UNDEF), means the reason filtering will be controlled by the +``printk.always_kmsg_dump`` boot param: if unset, it'll be KMSG_DUMP_OOPS, +otherwise KMSG_DUMP_MAX. + +Configurations for driver +------------------------- + +Only a block device driver cares about these configurations. A block device +driver uses ``register_pstore_blk`` to register to pstore/blk. + +.. kernel-doc:: fs/pstore/blk.c + :identifiers: register_pstore_blk + +Compression and header +---------------------- + +Block device is large enough for uncompressed oops data. Actually we do not +recommend data compression because pstore/blk will insert some information into +the first line of oops/panic data. For example:: + + Panic: Total 16 times + +It means that it's OOPS|Panic for the 16th time since the first booting. +Sometimes the number of occurrences of oops|panic since the first booting is +important to judge whether the system is stable. + +The following line is inserted by pstore filesystem. For example:: + + Oops#2 Part1 + +It means that it's OOPS for the 2nd time on the last boot. + +Reading the data +---------------- + +The dump data can be read from the pstore filesystem. The format for these +files is ``dmesg-pstore-blk-[N]`` for oops/panic front-end, +``pmsg-pstore-blk-0`` for pmsg front-end and so on. The timestamp of the +dump file records the trigger time. To delete a stored record from block +device, simply unlink the respective pstore file. + +Attentions in panic read/write APIs +----------------------------------- + +If on panic, the kernel is not going to run for much longer, the tasks will not +be scheduled and most kernel resources will be out of service. It +looks like a single-threaded program running on a single-core computer. + +The following points require special attention for panic read/write APIs: + +1. Can **NOT** allocate any memory. + If you need memory, just allocate while the block driver is initializing + rather than waiting until the panic. +#. Must be polled, **NOT** interrupt driven. + No task schedule any more. The block driver should delay to ensure the write + succeeds, but NOT sleep. +#. Can **NOT** take any lock. + There is no other task, nor any shared resource; you are safe to break all + locks. +#. Just use CPU to transfer. + Do not use DMA to transfer unless you are sure that DMA will not keep lock. +#. Control registers directly. + Please control registers directly rather than use Linux kernel resources. + Do I/O map while initializing rather than wait until a panic occurs. +#. Reset your block device and controller if necessary. + If you are not sure of the state of your block device and controller when + a panic occurs, you are safe to stop and reset them. + +pstore/blk supports psblk_blkdev_info(), which is defined in +*linux/pstore_blk.h*, to get information of using block device, such as the +device number, sector count and start sector of the whole disk. + +pstore block internals +---------------------- + +For developer reference, here are all the important structures and APIs: + +.. kernel-doc:: fs/pstore/zone.c + :internal: + +.. kernel-doc:: include/linux/pstore_zone.h + :internal: + +.. kernel-doc:: fs/pstore/blk.c + :export: + +.. kernel-doc:: include/linux/pstore_blk.h + :internal: diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..9c1f4feff418 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13660,6 +13660,7 @@ M: Tony Luck S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/pstore F: Documentation/admin-guide/ramoops.rst +F: Documentation/admin-guide/pstore-blk.rst F: Documentation/devicetree/bindings/reserved-memory/ramoops.txt F: drivers/acpi/apei/erst.c F: drivers/firmware/efi/efi-pstore.c diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index c2237984b407..e16a49ebfe54 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -171,6 +171,8 @@ config PSTORE_BLK This enables panic and oops message to be logged to a block dev where it can be read back at some later point. + For more information, see Documentation/admin-guide/pstore-blk.rst + If unsure, say N. config PSTORE_BLK_BLKDEV From 335426c6dcdd338d6b7c939c2da15fc9c5dd4959 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:03 +0800 Subject: [PATCH 39/43] pstore/zone: Provide way to skip "broken" zone for MTD devices One requirement to support MTD devices in pstore/zone is having a way to declare certain regions as broken. Add this support to pstore/zone. The MTD driver should return -ENOMSG when encountering a bad region, which tells pstore/zone to skip and try the next one. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-8-keescook@chromium.org/ Co-developed-by: Colin Ian King Signed-off-by: Colin Ian King Link: //lore.kernel.org/lkml/20200512173801.222666-1-colin.king@canonical.com Signed-off-by: Kees Cook --- fs/pstore/blk.c | 10 ++++-- fs/pstore/zone.c | 65 ++++++++++++++++++++++++++++++------- include/linux/pstore_blk.h | 3 +- include/linux/pstore_zone.h | 12 ++++--- 4 files changed, 71 insertions(+), 19 deletions(-) diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index e0d95fb48428..67343d6aa27a 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -109,9 +109,12 @@ struct bdev_info { * means error. * @write: The same as @read, but the following error number: * -EBUSY means try to write again later. + * -ENOMSG means to try next zone. * @panic_write:The write operation only used for panic case. It's optional - * if you do not care panic log. The parameters and return value - * are the same as @read. + * if you do not care panic log. The parameters are relative + * value to storage. + * On success, the number of bytes should be returned, others + * excluding -ENOMSG mean error. -ENOMSG means to try next zone. */ struct pstore_device_info { unsigned long total_size; @@ -337,6 +340,9 @@ static ssize_t psblk_blk_panic_write(const char *buf, size_t size, /* size and off must align to SECTOR_SIZE for block device */ ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, size >> SECTOR_SHIFT); + /* try next zone */ + if (ret == -ENOMSG) + return ret; return ret ? -EIO : size; } diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 2d9f452360bb..add26b125984 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -249,6 +249,9 @@ static int psz_zone_write(struct pstore_zone *zone, return 0; dirty: + /* no need to mark dirty if going to try next zone */ + if (wcnt == -ENOMSG) + return -ENOMSG; atomic_set(&zone->dirty, true); /* flush dirty zones nicely */ if (wcnt == -EBUSY && !is_on_panic()) @@ -391,7 +394,11 @@ static int psz_kmsg_recover_meta(struct psz_context *cxt) return -EINVAL; rcnt = info->read((char *)buf, len, zone->off); - if (rcnt != len) { + if (rcnt == -ENOMSG) { + pr_debug("%s with id %lu may be broken, skip\n", + zone->name, i); + continue; + } else if (rcnt != len) { pr_err("read %s with id %lu failed\n", zone->name, i); return (int)rcnt < 0 ? (int)rcnt : -EIO; } @@ -725,24 +732,58 @@ static void psz_write_kmsg_hdr(struct pstore_zone *zone, hdr->counter = 0; } +/* + * In case zone is broken, which may occur to MTD device, we try each zones, + * start at cxt->kmsg_write_cnt. + */ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, struct pstore_record *record) { size_t size, hlen; struct pstore_zone *zone; - unsigned int zonenum; + unsigned int i; - zonenum = cxt->kmsg_write_cnt; - zone = cxt->kpszs[zonenum]; - if (unlikely(!zone)) - return -ENOSPC; - cxt->kmsg_write_cnt = (zonenum + 1) % cxt->kmsg_max_cnt; + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + unsigned int zonenum, len; + int ret; - pr_debug("write %s to zone id %d\n", zone->name, zonenum); - psz_write_kmsg_hdr(zone, record); - hlen = sizeof(struct psz_kmsg_header); - size = min_t(size_t, record->size, zone->buffer_size - hlen); - return psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); + zonenum = (cxt->kmsg_write_cnt + i) % cxt->kmsg_max_cnt; + zone = cxt->kpszs[zonenum]; + if (unlikely(!zone)) + return -ENOSPC; + + /* avoid destroying old data, allocate a new one */ + len = zone->buffer_size + sizeof(*zone->buffer); + zone->oldbuf = zone->buffer; + zone->buffer = kzalloc(len, GFP_KERNEL); + if (!zone->buffer) { + zone->buffer = zone->oldbuf; + return -ENOMEM; + } + zone->buffer->sig = zone->oldbuf->sig; + + pr_debug("write %s to zone id %d\n", zone->name, zonenum); + psz_write_kmsg_hdr(zone, record); + hlen = sizeof(struct psz_kmsg_header); + size = min_t(size_t, record->size, zone->buffer_size - hlen); + ret = psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); + if (likely(!ret || ret != -ENOMSG)) { + cxt->kmsg_write_cnt = zonenum + 1; + cxt->kmsg_write_cnt %= cxt->kmsg_max_cnt; + /* no need to try next zone, free last zone buffer */ + kfree(zone->oldbuf); + zone->oldbuf = NULL; + return ret; + } + + pr_debug("zone %u may be broken, try next dmesg zone\n", + zonenum); + kfree(zone->buffer); + zone->buffer = zone->oldbuf; + zone->oldbuf = NULL; + } + + return -EBUSY; } static int notrace psz_kmsg_write(struct psz_context *cxt, diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index 4501977b1336..ccba8c068752 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -14,7 +14,8 @@ * @start_sect: start sector to block device * @sects: sectors count on buf * - * Return: On success, zero should be returned. Others mean error. + * Return: On success, zero should be returned. Others excluding -ENOMSG + * mean error. -ENOMSG means to try next zone. * * Panic write to block device must be aligned to SECTOR_SIZE. */ diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index 6f16b0dd834a..e79a18e41064 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -23,11 +23,15 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @read: The general read operation. Both of the function parameters * @size and @offset are relative value to storage. * On success, the number of bytes should be returned, others - * means error. - * @write: The same as @read, but -EBUSY means try to write again later. + * mean error. + * @write: The same as @read, but the following error number: + * -EBUSY means try to write again later. + * -ENOMSG means to try next zone. * @panic_write:The write operation only used for panic case. It's optional - * if you do not care panic log. The parameters and return value - * are the same as @read. + * if you do not care panic log. The parameters are relative + * value to storage. + * On success, the number of bytes should be returned, others + * excluding -ENOMSG mean error. -ENOMSG means to try next zone. */ struct pstore_zone_info { struct module *owner; From 1525fb3bb6d69028b3941d34363397c28345ffab Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:04 +0800 Subject: [PATCH 40/43] pstore/blk: Provide way to query pstore configuration In order to configure itself, the MTD backend needs to be able to query the current pstore configuration. Introduce pstore_blk_get_config() for this purpose. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-9-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- fs/pstore/blk.c | 37 ++++++++++++++++++++++++++++++------- include/linux/pstore_blk.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 67343d6aa27a..631bc27c8661 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -94,6 +94,17 @@ struct bdev_info { sector_t start_sect; }; +#define check_size(name, alignsize) ({ \ + long _##name_ = (name); \ + _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ + if (_##name_ & ((alignsize) - 1)) { \ + pr_info(#name " must align to %d\n", \ + (alignsize)); \ + _##name_ = ALIGN(name, (alignsize)); \ + } \ + _##name_; \ +}) + /** * struct pstore_device_info - back-end pstore/blk driver structure. * @@ -149,13 +160,11 @@ static int psblk_register_do(struct pstore_device_info *dev) dev->flags = UINT_MAX; #define verify_size(name, alignsize, enabled) { \ - long _##name_ = (enabled) ? (name) : 0; \ - _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ - if (_##name_ & ((alignsize) - 1)) { \ - pr_info(#name " must align to %d\n", \ - (alignsize)); \ - _##name_ = ALIGN(name, (alignsize)); \ - } \ + long _##name_; \ + if (enabled) \ + _##name_ = check_size(name, alignsize); \ + else \ + _##name_ = 0; \ name = _##name_ / 1024; \ pstore_zone_info->name = _##name_; \ } @@ -453,6 +462,20 @@ void unregister_pstore_blk(unsigned int major) } EXPORT_SYMBOL_GPL(unregister_pstore_blk); +/* get information of pstore/blk */ +int pstore_blk_get_config(struct pstore_blk_config *info) +{ + strncpy(info->device, blkdev, 80); + info->max_reason = max_reason; + info->kmsg_size = check_size(kmsg_size, 4096); + info->pmsg_size = check_size(pmsg_size, 4096); + info->ftrace_size = check_size(ftrace_size, 4096); + info->console_size = check_size(console_size, 4096); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_blk_get_config); + static void __exit pstore_blk_exit(void) { mutex_lock(&pstore_blk_lock); diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index ccba8c068752..0c40774e71e0 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -49,4 +49,32 @@ struct pstore_blk_info { int register_pstore_blk(struct pstore_blk_info *info); void unregister_pstore_blk(unsigned int major); +/** + * struct pstore_blk_config - the pstore_blk backend configuration + * + * @device: Name of the desired block device + * @max_reason: Maximum kmsg dump reason to store to block device + * @kmsg_size: Total size of for kmsg dumps + * @pmsg_size: Total size of the pmsg storage area + * @console_size: Total size of the console storage area + * @ftrace_size: Total size for ftrace logging data (for all CPUs) + */ +struct pstore_blk_config { + char device[80]; + enum kmsg_dump_reason max_reason; + unsigned long kmsg_size; + unsigned long pmsg_size; + unsigned long console_size; + unsigned long ftrace_size; +}; + +/** + * pstore_blk_get_config - get a copy of the pstore_blk backend configuration + * + * @info: The sturct pstore_blk_config to be filled in + * + * Failure returns negative error code, and success returns 0. + */ +int pstore_blk_get_config(struct pstore_blk_config *info); + #endif From 7dcb7848ba110ff192efc917d1a6de66b4c9ca4f Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:05 +0800 Subject: [PATCH 41/43] pstore/blk: Support non-block storage devices Add support for non-block devices (e.g. MTD). A non-block driver calls pstore_blk_register_device() to register iself. In addition, pstore/zone is updated to handle non-block devices, where an erase must be done before a write. Without this, there is no way to remove records stored to an MTD. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-10-keescook@chromium.org/ Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- Documentation/admin-guide/pstore-blk.rst | 17 ++++- fs/pstore/blk.c | 95 +++++++++++++----------- fs/pstore/zone.c | 8 +- include/linux/pstore_blk.h | 38 ++++++++++ include/linux/pstore_zone.h | 6 ++ 5 files changed, 115 insertions(+), 49 deletions(-) diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst index bef8c7436721..d45341e55e82 100644 --- a/Documentation/admin-guide/pstore-blk.rst +++ b/Documentation/admin-guide/pstore-blk.rst @@ -7,8 +7,8 @@ Introduction ------------ pstore block (pstore/blk) is an oops/panic logger that writes its logs to a -block device before the system crashes. You can get these log files by -mounting pstore filesystem like:: +block device and non-block device before the system crashes. You can get +these log files by mounting pstore filesystem like:: mount -t pstore pstore /sys/fs/pstore @@ -24,8 +24,8 @@ Configurations for user determine how pstore/blk works, such as pmsg_size, kmsg_size and so on. All of them support both Kconfig and module parameters, but module parameters have priority over Kconfig. -Configurations for driver are all about block device, such as total_size -of block device and read/write operations. +Configurations for driver are all about block device and non-block device, +such as total_size of block device and read/write operations. Configurations for user ----------------------- @@ -152,6 +152,15 @@ driver uses ``register_pstore_blk`` to register to pstore/blk. .. kernel-doc:: fs/pstore/blk.c :identifiers: register_pstore_blk +A non-block device driver uses ``register_pstore_device`` with +``struct pstore_device_info`` to register to pstore/blk. + +.. kernel-doc:: fs/pstore/blk.c + :identifiers: register_pstore_device + +.. kernel-doc:: include/linux/pstore_blk.h + :identifiers: pstore_device_info + Compression and header ---------------------- diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 631bc27c8661..881b40ed8142 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -105,55 +105,22 @@ struct bdev_info { _##name_; \ }) -/** - * struct pstore_device_info - back-end pstore/blk driver structure. - * - * @total_size: The total size in bytes pstore/blk can use. It must be greater - * than 4096 and be multiple of 4096. - * @flags: Refer to macro starting with PSTORE_FLAGS defined in - * linux/pstore.h. It means what front-ends this device support. - * Zero means all backends for compatible. - * @read: The general read operation. Both of the function parameters - * @size and @offset are relative value to bock device (not the - * whole disk). - * On success, the number of bytes should be returned, others - * means error. - * @write: The same as @read, but the following error number: - * -EBUSY means try to write again later. - * -ENOMSG means to try next zone. - * @panic_write:The write operation only used for panic case. It's optional - * if you do not care panic log. The parameters are relative - * value to storage. - * On success, the number of bytes should be returned, others - * excluding -ENOMSG mean error. -ENOMSG means to try next zone. - */ -struct pstore_device_info { - unsigned long total_size; - unsigned int flags; - pstore_zone_read_op read; - pstore_zone_write_op write; - pstore_zone_write_op panic_write; -}; - -static int psblk_register_do(struct pstore_device_info *dev) +static int __register_pstore_device(struct pstore_device_info *dev) { int ret; + lockdep_assert_held(&pstore_blk_lock); + if (!dev || !dev->total_size || !dev->read || !dev->write) return -EINVAL; - mutex_lock(&pstore_blk_lock); - /* someone already registered before */ - if (pstore_zone_info) { - mutex_unlock(&pstore_blk_lock); + if (pstore_zone_info) return -EBUSY; - } + pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); - if (!pstore_zone_info) { - mutex_unlock(&pstore_blk_lock); + if (!pstore_zone_info) return -ENOMEM; - } /* zero means not limit on which backends to attempt to store. */ if (!dev->flags) @@ -179,6 +146,7 @@ static int psblk_register_do(struct pstore_device_info *dev) pstore_zone_info->max_reason = max_reason; pstore_zone_info->read = dev->read; pstore_zone_info->write = dev->write; + pstore_zone_info->erase = dev->erase; pstore_zone_info->panic_write = dev->panic_write; pstore_zone_info->name = KBUILD_MODNAME; pstore_zone_info->owner = THIS_MODULE; @@ -188,20 +156,51 @@ static int psblk_register_do(struct pstore_device_info *dev) kfree(pstore_zone_info); pstore_zone_info = NULL; } - mutex_unlock(&pstore_blk_lock); return ret; } - -static void psblk_unregister_do(struct pstore_device_info *dev) +/** + * register_pstore_device() - register non-block device to pstore/blk + * + * @dev: non-block device information + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_device(struct pstore_device_info *dev) { + int ret; + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_device(dev); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_device); + +static void __unregister_pstore_device(struct pstore_device_info *dev) +{ + lockdep_assert_held(&pstore_blk_lock); if (pstore_zone_info && pstore_zone_info->read == dev->read) { unregister_pstore_zone(pstore_zone_info); kfree(pstore_zone_info); pstore_zone_info = NULL; } +} + +/** + * unregister_pstore_device() - unregister non-block device from pstore/blk + * + * @dev: non-block device information + */ +void unregister_pstore_device(struct pstore_device_info *dev) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_device(dev); mutex_unlock(&pstore_blk_lock); } +EXPORT_SYMBOL_GPL(unregister_pstore_device); /** * psblk_get_bdev() - open block device @@ -396,9 +395,10 @@ static int __register_pstore_blk(struct pstore_blk_info *info) dev.flags = info->flags; dev.read = psblk_generic_blk_read; dev.write = psblk_generic_blk_write; + dev.erase = NULL; dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; - ret = psblk_register_do(&dev); + ret = __register_pstore_device(&dev); if (ret) goto err_put_bdev; @@ -442,7 +442,7 @@ static void __unregister_pstore_blk(unsigned int major) lockdep_assert_held(&pstore_blk_lock); if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { - psblk_unregister_do(&dev); + __unregister_pstore_device(&dev); psblk_put_bdev(psblk_bdev, holder); blkdev_panic_write = NULL; psblk_bdev = NULL; @@ -481,6 +481,13 @@ static void __exit pstore_blk_exit(void) mutex_lock(&pstore_blk_lock); if (psblk_bdev) __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); + else { + struct pstore_device_info dev = { }; + + if (pstore_zone_info) + dev.read = pstore_zone_info->read; + __unregister_pstore_device(&dev); + } mutex_unlock(&pstore_blk_lock); } module_exit(pstore_blk_exit); diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index add26b125984..819428dfa32f 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -660,15 +660,21 @@ static inline int psz_kmsg_erase(struct psz_context *cxt, struct psz_buffer *buffer = zone->buffer; struct psz_kmsg_header *hdr = (struct psz_kmsg_header *)buffer->data; + size_t size; if (unlikely(!psz_ok(zone))) return 0; + /* this zone is already updated, no need to erase */ if (record->count != hdr->counter) return 0; + size = buffer_datalen(zone) + sizeof(*zone->buffer); atomic_set(&zone->buffer->datalen, 0); - return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + if (cxt->pstore_zone_info->erase) + return cxt->pstore_zone_info->erase(size, zone->off); + else + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); } static inline int psz_record_erase(struct psz_context *cxt, diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index 0c40774e71e0..61e914522b01 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -49,6 +49,44 @@ struct pstore_blk_info { int register_pstore_blk(struct pstore_blk_info *info); void unregister_pstore_blk(unsigned int major); +/** + * struct pstore_device_info - back-end pstore/blk driver structure. + * + * @total_size: The total size in bytes pstore/blk can use. It must be greater + * than 4096 and be multiple of 4096. + * @flags: Refer to macro starting with PSTORE_FLAGS defined in + * linux/pstore.h. It means what front-ends this device support. + * Zero means all backends for compatible. + * @read: The general read operation. Both of the function parameters + * @size and @offset are relative value to bock device (not the + * whole disk). + * On success, the number of bytes should be returned, others + * means error. + * @write: The same as @read, but the following error number: + * -EBUSY means try to write again later. + * -ENOMSG means to try next zone. + * @erase: The general erase operation for device with special removing + * job. Both of the function parameters @size and @offset are + * relative value to storage. + * Return 0 on success and others on failure. + * @panic_write:The write operation only used for panic case. It's optional + * if you do not care panic log. The parameters are relative + * value to storage. + * On success, the number of bytes should be returned, others + * excluding -ENOMSG mean error. -ENOMSG means to try next zone. + */ +struct pstore_device_info { + unsigned long total_size; + unsigned int flags; + pstore_zone_read_op read; + pstore_zone_write_op write; + pstore_zone_erase_op erase; + pstore_zone_write_op panic_write; +}; + +int register_pstore_device(struct pstore_device_info *dev); +void unregister_pstore_device(struct pstore_device_info *dev); + /** * struct pstore_blk_config - the pstore_blk backend configuration * diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h index e79a18e41064..1e35eaa33e5e 100644 --- a/include/linux/pstore_zone.h +++ b/include/linux/pstore_zone.h @@ -7,6 +7,7 @@ typedef ssize_t (*pstore_zone_read_op)(char *, size_t, loff_t); typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); +typedef ssize_t (*pstore_zone_erase_op)(size_t, loff_t); /** * struct pstore_zone_info - pstore/zone back-end driver structure * @@ -27,6 +28,10 @@ typedef ssize_t (*pstore_zone_write_op)(const char *, size_t, loff_t); * @write: The same as @read, but the following error number: * -EBUSY means try to write again later. * -ENOMSG means to try next zone. + * @erase: The general erase operation for device with special removing + * job. Both of the function parameters @size and @offset are + * relative value to storage. + * Return 0 on success and others on failure. * @panic_write:The write operation only used for panic case. It's optional * if you do not care panic log. The parameters are relative * value to storage. @@ -45,6 +50,7 @@ struct pstore_zone_info { unsigned long ftrace_size; pstore_zone_read_op read; pstore_zone_write_op write; + pstore_zone_erase_op erase; pstore_zone_write_op panic_write; }; From f8feafeaeedbf0a324c373c5fa29a2098a69c458 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 8 May 2020 08:34:01 -0700 Subject: [PATCH 42/43] pstore/blk: Introduce "best_effort" mode In order to use arbitrary block devices as a pstore backend, provide a new module param named "best_effort", which will allow using any block device, even if it has not provided a panic_write callback. Link: https://lore.kernel.org/lkml/20200511233229.27745-12-keescook@chromium.org/ Signed-off-by: Kees Cook --- fs/pstore/blk.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 881b40ed8142..fcd5563dde06 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -51,6 +51,10 @@ static long ftrace_size = -1; module_param(ftrace_size, long, 0400); MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); +static bool best_effort; +module_param(best_effort, bool, 0400); +MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require storage driver pstore support, default: off)"); + /* * blkdev - the block device to use for pstore storage * @@ -374,7 +378,8 @@ static int __register_pstore_blk(struct pstore_blk_info *info) } /* only allow driver matching the @blkdev */ - if (!binfo.devt || MAJOR(binfo.devt) != info->major) { + if (!binfo.devt || (!best_effort && + MAJOR(binfo.devt) != info->major)) { pr_debug("invalid major %u (expect %u)\n", info->major, MAJOR(binfo.devt)); ret = -ENODEV; @@ -476,6 +481,20 @@ int pstore_blk_get_config(struct pstore_blk_config *info) } EXPORT_SYMBOL_GPL(pstore_blk_get_config); +static int __init pstore_blk_init(void) +{ + struct pstore_blk_info info = { }; + int ret = 0; + + mutex_lock(&pstore_blk_lock); + if (!pstore_zone_info && best_effort && blkdev[0]) + ret = __register_pstore_blk(&info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +late_initcall(pstore_blk_init); + static void __exit pstore_blk_exit(void) { mutex_lock(&pstore_blk_lock); From 78c08247b9d3e03192f8b359aa079024e805a948 Mon Sep 17 00:00:00 2001 From: WeiXiong Liao Date: Wed, 25 Mar 2020 16:55:06 +0800 Subject: [PATCH 43/43] mtd: Support kmsg dumper based on pstore/blk This introduces mtdpstore, which is similar to mtdoops but more powerful. It uses pstore/blk, and aims to store panic and oops logs to a flash partition, where pstore can later read back and present as files in the mounted pstore filesystem. To make mtdpstore work, the "blkdev" of pstore/blk should be set as MTD device name or MTD device number. For more details, see Documentation/admin-guide/pstore-blk.rst This solves a number of issues: - Work duplication: both of pstore and mtdoops do the same job storing panic/oops log. They have very similar logic, registering to kmsg dumper and storing logs to several chunks one by one. - Layer violations: drivers should provides methods instead of polices. MTD should provide read/write/erase operations, and allow a higher level drivers to provide the chunk management, kmsg dump configuration, etc. - Missing features: pstore provides many additional features, including presenting the logs as files, logging dump time and count, and supporting other frontends like pmsg, console, etc. Signed-off-by: WeiXiong Liao Link: https://lore.kernel.org/lkml/20200511233229.27745-11-keescook@chromium.org/ Link: https://lore.kernel.org/r/1589266715-4168-1-git-send-email-liaoweixiong@allwinnertech.com Signed-off-by: Kees Cook --- Documentation/admin-guide/pstore-blk.rst | 9 +- drivers/mtd/Kconfig | 10 + drivers/mtd/Makefile | 1 + drivers/mtd/mtdpstore.c | 578 +++++++++++++++++++++++ 4 files changed, 596 insertions(+), 2 deletions(-) create mode 100644 drivers/mtd/mtdpstore.c diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst index d45341e55e82..296d5027787a 100644 --- a/Documentation/admin-guide/pstore-blk.rst +++ b/Documentation/admin-guide/pstore-blk.rst @@ -43,9 +43,9 @@ blkdev ~~~~~~ The block device to use. Most of the time, it is a partition of block device. -It's required for pstore/blk. +It's required for pstore/blk. It is also used for MTD device. -It accepts the following variants: +It accepts the following variants for block device: 1. device number in hexadecimal represents itself; no leading 0x, for example b302. @@ -64,6 +64,11 @@ It accepts the following variants: partition with a known unique id. #. : major and minor number of the device separated by a colon. +It accepts the following variants for MTD device: + +1. MTD device name. "pstore" is recommended. +#. MTD device number. + kmsg_size ~~~~~~~~~ diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 42d401ea60ee..6ddab796216d 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -170,6 +170,16 @@ config MTD_OOPS buffer in a flash partition where it can be read back at some later point. +config MTD_PSTORE + tristate "Log panic/oops to an MTD buffer based on pstore" + depends on PSTORE_BLK + help + This enables panic and oops messages to be logged to a circular + buffer in a flash partition where it can be read back as files after + mounting pstore filesystem. + + If unsure, say N. + config MTD_SWAP tristate "Swap on MTD device support" depends on MTD && SWAP diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 56cc60ccc477..593d0593a038 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_RFD_FTL) += rfd_ftl.o obj-$(CONFIG_SSFDC) += ssfdc.o obj-$(CONFIG_SM_FTL) += sm_ftl.o obj-$(CONFIG_MTD_OOPS) += mtdoops.o +obj-$(CONFIG_MTD_PSTORE) += mtdpstore.o obj-$(CONFIG_MTD_SWAP) += mtdswap.o nftl-objs := nftlcore.o nftlmount.o diff --git a/drivers/mtd/mtdpstore.c b/drivers/mtd/mtdpstore.c new file mode 100644 index 000000000000..a4fe6060b960 --- /dev/null +++ b/drivers/mtd/mtdpstore.c @@ -0,0 +1,578 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define dev_fmt(fmt) "mtdoops-pstore: " fmt + +#include +#include +#include +#include +#include + +static struct mtdpstore_context { + int index; + struct pstore_blk_config info; + struct pstore_device_info dev; + struct mtd_info *mtd; + unsigned long *rmmap; /* removed bit map */ + unsigned long *usedmap; /* used bit map */ + /* + * used for panic write + * As there are no block_isbad for panic case, we should keep this + * status before panic to ensure panic_write not failed. + */ + unsigned long *badmap; /* bad block bit map */ +} oops_cxt; + +static int mtdpstore_block_isbad(struct mtdpstore_context *cxt, loff_t off) +{ + int ret; + struct mtd_info *mtd = cxt->mtd; + u64 blknum; + + off = ALIGN_DOWN(off, mtd->erasesize); + blknum = div_u64(off, mtd->erasesize); + + if (test_bit(blknum, cxt->badmap)) + return true; + ret = mtd_block_isbad(mtd, off); + if (ret < 0) { + dev_err(&mtd->dev, "mtd_block_isbad failed, aborting\n"); + return ret; + } else if (ret > 0) { + set_bit(blknum, cxt->badmap); + return true; + } + return false; +} + +static inline int mtdpstore_panic_block_isbad(struct mtdpstore_context *cxt, + loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u64 blknum; + + off = ALIGN_DOWN(off, mtd->erasesize); + blknum = div_u64(off, mtd->erasesize); + return test_bit(blknum, cxt->badmap); +} + +static inline void mtdpstore_mark_used(struct mtdpstore_context *cxt, + loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u64 zonenum = div_u64(off, cxt->info.kmsg_size); + + dev_dbg(&mtd->dev, "mark zone %llu used\n", zonenum); + set_bit(zonenum, cxt->usedmap); +} + +static inline void mtdpstore_mark_unused(struct mtdpstore_context *cxt, + loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u64 zonenum = div_u64(off, cxt->info.kmsg_size); + + dev_dbg(&mtd->dev, "mark zone %llu unused\n", zonenum); + clear_bit(zonenum, cxt->usedmap); +} + +static inline void mtdpstore_block_mark_unused(struct mtdpstore_context *cxt, + loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u32 zonecnt = mtd->erasesize / cxt->info.kmsg_size; + u64 zonenum; + + off = ALIGN_DOWN(off, mtd->erasesize); + zonenum = div_u64(off, cxt->info.kmsg_size); + while (zonecnt > 0) { + dev_dbg(&mtd->dev, "mark zone %llu unused\n", zonenum); + clear_bit(zonenum, cxt->usedmap); + zonenum++; + zonecnt--; + } +} + +static inline int mtdpstore_is_used(struct mtdpstore_context *cxt, loff_t off) +{ + u64 zonenum = div_u64(off, cxt->info.kmsg_size); + u64 blknum = div_u64(off, cxt->mtd->erasesize); + + if (test_bit(blknum, cxt->badmap)) + return true; + return test_bit(zonenum, cxt->usedmap); +} + +static int mtdpstore_block_is_used(struct mtdpstore_context *cxt, + loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u32 zonecnt = mtd->erasesize / cxt->info.kmsg_size; + u64 zonenum; + + off = ALIGN_DOWN(off, mtd->erasesize); + zonenum = div_u64(off, cxt->info.kmsg_size); + while (zonecnt > 0) { + if (test_bit(zonenum, cxt->usedmap)) + return true; + zonenum++; + zonecnt--; + } + return false; +} + +static int mtdpstore_is_empty(struct mtdpstore_context *cxt, char *buf, + size_t size) +{ + struct mtd_info *mtd = cxt->mtd; + size_t sz; + int i; + + sz = min_t(uint32_t, size, mtd->writesize / 4); + for (i = 0; i < sz; i++) { + if (buf[i] != (char)0xFF) + return false; + } + return true; +} + +static void mtdpstore_mark_removed(struct mtdpstore_context *cxt, loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u64 zonenum = div_u64(off, cxt->info.kmsg_size); + + dev_dbg(&mtd->dev, "mark zone %llu removed\n", zonenum); + set_bit(zonenum, cxt->rmmap); +} + +static void mtdpstore_block_clear_removed(struct mtdpstore_context *cxt, + loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u32 zonecnt = mtd->erasesize / cxt->info.kmsg_size; + u64 zonenum; + + off = ALIGN_DOWN(off, mtd->erasesize); + zonenum = div_u64(off, cxt->info.kmsg_size); + while (zonecnt > 0) { + clear_bit(zonenum, cxt->rmmap); + zonenum++; + zonecnt--; + } +} + +static int mtdpstore_block_is_removed(struct mtdpstore_context *cxt, + loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + u32 zonecnt = mtd->erasesize / cxt->info.kmsg_size; + u64 zonenum; + + off = ALIGN_DOWN(off, mtd->erasesize); + zonenum = div_u64(off, cxt->info.kmsg_size); + while (zonecnt > 0) { + if (test_bit(zonenum, cxt->rmmap)) + return true; + zonenum++; + zonecnt--; + } + return false; +} + +static int mtdpstore_erase_do(struct mtdpstore_context *cxt, loff_t off) +{ + struct mtd_info *mtd = cxt->mtd; + struct erase_info erase; + int ret; + + off = ALIGN_DOWN(off, cxt->mtd->erasesize); + dev_dbg(&mtd->dev, "try to erase off 0x%llx\n", off); + erase.len = cxt->mtd->erasesize; + erase.addr = off; + ret = mtd_erase(cxt->mtd, &erase); + if (!ret) + mtdpstore_block_clear_removed(cxt, off); + else + dev_err(&mtd->dev, "erase of region [0x%llx, 0x%llx] on \"%s\" failed\n", + (unsigned long long)erase.addr, + (unsigned long long)erase.len, cxt->info.device); + return ret; +} + +/* + * called while removing file + * + * Avoiding over erasing, do erase block only when the whole block is unused. + * If the block contains valid log, do erase lazily on flush_removed() when + * unregister. + */ +static ssize_t mtdpstore_erase(size_t size, loff_t off) +{ + struct mtdpstore_context *cxt = &oops_cxt; + + if (mtdpstore_block_isbad(cxt, off)) + return -EIO; + + mtdpstore_mark_unused(cxt, off); + + /* If the block still has valid data, mtdpstore do erase lazily */ + if (likely(mtdpstore_block_is_used(cxt, off))) { + mtdpstore_mark_removed(cxt, off); + return 0; + } + + /* all zones are unused, erase it */ + return mtdpstore_erase_do(cxt, off); +} + +/* + * What is security for mtdpstore? + * As there is no erase for panic case, we should ensure at least one zone + * is writable. Otherwise, panic write will fail. + * If zone is used, write operation will return -ENOMSG, which means that + * pstore/blk will try one by one until gets an empty zone. So, it is not + * needed to ensure the next zone is empty, but at least one. + */ +static int mtdpstore_security(struct mtdpstore_context *cxt, loff_t off) +{ + int ret = 0, i; + struct mtd_info *mtd = cxt->mtd; + u32 zonenum = (u32)div_u64(off, cxt->info.kmsg_size); + u32 zonecnt = (u32)div_u64(cxt->mtd->size, cxt->info.kmsg_size); + u32 blkcnt = (u32)div_u64(cxt->mtd->size, cxt->mtd->erasesize); + u32 erasesize = cxt->mtd->erasesize; + + for (i = 0; i < zonecnt; i++) { + u32 num = (zonenum + i) % zonecnt; + + /* found empty zone */ + if (!test_bit(num, cxt->usedmap)) + return 0; + } + + /* If there is no any empty zone, we have no way but to do erase */ + while (blkcnt--) { + div64_u64_rem(off + erasesize, cxt->mtd->size, (u64 *)&off); + + if (mtdpstore_block_isbad(cxt, off)) + continue; + + ret = mtdpstore_erase_do(cxt, off); + if (!ret) { + mtdpstore_block_mark_unused(cxt, off); + break; + } + } + + if (ret) + dev_err(&mtd->dev, "all blocks bad!\n"); + dev_dbg(&mtd->dev, "end security\n"); + return ret; +} + +static ssize_t mtdpstore_write(const char *buf, size_t size, loff_t off) +{ + struct mtdpstore_context *cxt = &oops_cxt; + struct mtd_info *mtd = cxt->mtd; + size_t retlen; + int ret; + + if (mtdpstore_block_isbad(cxt, off)) + return -ENOMSG; + + /* zone is used, please try next one */ + if (mtdpstore_is_used(cxt, off)) + return -ENOMSG; + + dev_dbg(&mtd->dev, "try to write off 0x%llx size %zu\n", off, size); + ret = mtd_write(cxt->mtd, off, size, &retlen, (u_char *)buf); + if (ret < 0 || retlen != size) { + dev_err(&mtd->dev, "write failure at %lld (%zu of %zu written), err %d\n", + off, retlen, size, ret); + return -EIO; + } + mtdpstore_mark_used(cxt, off); + + mtdpstore_security(cxt, off); + return retlen; +} + +static inline bool mtdpstore_is_io_error(int ret) +{ + return ret < 0 && !mtd_is_bitflip(ret) && !mtd_is_eccerr(ret); +} + +/* + * All zones will be read as pstore/blk will read zone one by one when do + * recover. + */ +static ssize_t mtdpstore_read(char *buf, size_t size, loff_t off) +{ + struct mtdpstore_context *cxt = &oops_cxt; + struct mtd_info *mtd = cxt->mtd; + size_t retlen, done; + int ret; + + if (mtdpstore_block_isbad(cxt, off)) + return -ENOMSG; + + dev_dbg(&mtd->dev, "try to read off 0x%llx size %zu\n", off, size); + for (done = 0, retlen = 0; done < size; done += retlen) { + retlen = 0; + + ret = mtd_read(cxt->mtd, off + done, size - done, &retlen, + (u_char *)buf + done); + if (mtdpstore_is_io_error(ret)) { + dev_err(&mtd->dev, "read failure at %lld (%zu of %zu read), err %d\n", + off + done, retlen, size - done, ret); + /* the zone may be broken, try next one */ + return -ENOMSG; + } + + /* + * ECC error. The impact on log data is so small. Maybe we can + * still read it and try to understand. So mtdpstore just hands + * over what it gets and user can judge whether the data is + * valid or not. + */ + if (mtd_is_eccerr(ret)) { + dev_err(&mtd->dev, "ecc error at %lld (%zu of %zu read), err %d\n", + off + done, retlen, size - done, ret); + /* driver may not set retlen when ecc error */ + retlen = retlen == 0 ? size - done : retlen; + } + } + + if (mtdpstore_is_empty(cxt, buf, size)) + mtdpstore_mark_unused(cxt, off); + else + mtdpstore_mark_used(cxt, off); + + mtdpstore_security(cxt, off); + return retlen; +} + +static ssize_t mtdpstore_panic_write(const char *buf, size_t size, loff_t off) +{ + struct mtdpstore_context *cxt = &oops_cxt; + struct mtd_info *mtd = cxt->mtd; + size_t retlen; + int ret; + + if (mtdpstore_panic_block_isbad(cxt, off)) + return -ENOMSG; + + /* zone is used, please try next one */ + if (mtdpstore_is_used(cxt, off)) + return -ENOMSG; + + ret = mtd_panic_write(cxt->mtd, off, size, &retlen, (u_char *)buf); + if (ret < 0 || size != retlen) { + dev_err(&mtd->dev, "panic write failure at %lld (%zu of %zu read), err %d\n", + off, retlen, size, ret); + return -EIO; + } + mtdpstore_mark_used(cxt, off); + + return retlen; +} + +static void mtdpstore_notify_add(struct mtd_info *mtd) +{ + int ret; + struct mtdpstore_context *cxt = &oops_cxt; + struct pstore_blk_config *info = &cxt->info; + unsigned long longcnt; + + if (!strcmp(mtd->name, info->device)) + cxt->index = mtd->index; + + if (mtd->index != cxt->index || cxt->index < 0) + return; + + dev_dbg(&mtd->dev, "found matching MTD device %s\n", mtd->name); + + if (mtd->size < info->kmsg_size * 2) { + dev_err(&mtd->dev, "MTD partition %d not big enough\n", + mtd->index); + return; + } + /* + * kmsg_size must be aligned to 4096 Bytes, which is limited by + * psblk. The default value of kmsg_size is 64KB. If kmsg_size + * is larger than erasesize, some errors will occur since mtdpsotre + * is designed on it. + */ + if (mtd->erasesize < info->kmsg_size) { + dev_err(&mtd->dev, "eraseblock size of MTD partition %d too small\n", + mtd->index); + return; + } + if (unlikely(info->kmsg_size % mtd->writesize)) { + dev_err(&mtd->dev, "record size %lu KB must align to write size %d KB\n", + info->kmsg_size / 1024, + mtd->writesize / 1024); + return; + } + + longcnt = BITS_TO_LONGS(div_u64(mtd->size, info->kmsg_size)); + cxt->rmmap = kcalloc(longcnt, sizeof(long), GFP_KERNEL); + cxt->usedmap = kcalloc(longcnt, sizeof(long), GFP_KERNEL); + + longcnt = BITS_TO_LONGS(div_u64(mtd->size, mtd->erasesize)); + cxt->badmap = kcalloc(longcnt, sizeof(long), GFP_KERNEL); + + cxt->dev.total_size = mtd->size; + /* just support dmesg right now */ + cxt->dev.flags = PSTORE_FLAGS_DMESG; + cxt->dev.read = mtdpstore_read; + cxt->dev.write = mtdpstore_write; + cxt->dev.erase = mtdpstore_erase; + cxt->dev.panic_write = mtdpstore_panic_write; + + ret = register_pstore_device(&cxt->dev); + if (ret) { + dev_err(&mtd->dev, "mtd%d register to psblk failed\n", + mtd->index); + return; + } + cxt->mtd = mtd; + dev_info(&mtd->dev, "Attached to MTD device %d\n", mtd->index); +} + +static int mtdpstore_flush_removed_do(struct mtdpstore_context *cxt, + loff_t off, size_t size) +{ + struct mtd_info *mtd = cxt->mtd; + u_char *buf; + int ret; + size_t retlen; + struct erase_info erase; + + buf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* 1st. read to cache */ + ret = mtd_read(mtd, off, mtd->erasesize, &retlen, buf); + if (mtdpstore_is_io_error(ret)) + goto free; + + /* 2nd. erase block */ + erase.len = mtd->erasesize; + erase.addr = off; + ret = mtd_erase(mtd, &erase); + if (ret) + goto free; + + /* 3rd. write back */ + while (size) { + unsigned int zonesize = cxt->info.kmsg_size; + + /* there is valid data on block, write back */ + if (mtdpstore_is_used(cxt, off)) { + ret = mtd_write(mtd, off, zonesize, &retlen, buf); + if (ret) + dev_err(&mtd->dev, "write failure at %lld (%zu of %u written), err %d\n", + off, retlen, zonesize, ret); + } + + off += zonesize; + size -= min_t(unsigned int, zonesize, size); + } + +free: + kfree(buf); + return ret; +} + +/* + * What does mtdpstore_flush_removed() do? + * When user remove any log file on pstore filesystem, mtdpstore should do + * something to ensure log file removed. If the whole block is no longer used, + * it's nice to erase the block. However if the block still contains valid log, + * what mtdpstore can do is to erase and write the valid log back. + */ +static int mtdpstore_flush_removed(struct mtdpstore_context *cxt) +{ + struct mtd_info *mtd = cxt->mtd; + int ret; + loff_t off; + u32 blkcnt = (u32)div_u64(mtd->size, mtd->erasesize); + + for (off = 0; blkcnt > 0; blkcnt--, off += mtd->erasesize) { + ret = mtdpstore_block_isbad(cxt, off); + if (ret) + continue; + + ret = mtdpstore_block_is_removed(cxt, off); + if (!ret) + continue; + + ret = mtdpstore_flush_removed_do(cxt, off, mtd->erasesize); + if (ret) + return ret; + } + return 0; +} + +static void mtdpstore_notify_remove(struct mtd_info *mtd) +{ + struct mtdpstore_context *cxt = &oops_cxt; + + if (mtd->index != cxt->index || cxt->index < 0) + return; + + mtdpstore_flush_removed(cxt); + + unregister_pstore_device(&cxt->dev); + kfree(cxt->badmap); + kfree(cxt->usedmap); + kfree(cxt->rmmap); + cxt->mtd = NULL; + cxt->index = -1; +} + +static struct mtd_notifier mtdpstore_notifier = { + .add = mtdpstore_notify_add, + .remove = mtdpstore_notify_remove, +}; + +static int __init mtdpstore_init(void) +{ + int ret; + struct mtdpstore_context *cxt = &oops_cxt; + struct pstore_blk_config *info = &cxt->info; + + ret = pstore_blk_get_config(info); + if (unlikely(ret)) + return ret; + + if (strlen(info->device) == 0) { + pr_err("mtd device must be supplied (device name is empty)\n"); + return -EINVAL; + } + if (!info->kmsg_size) { + pr_err("no backend enabled (kmsg_size is 0)\n"); + return -EINVAL; + } + + /* Setup the MTD device to use */ + ret = kstrtoint((char *)info->device, 0, &cxt->index); + if (ret) + cxt->index = -1; + + register_mtd_user(&mtdpstore_notifier); + return 0; +} +module_init(mtdpstore_init); + +static void __exit mtdpstore_exit(void) +{ + unregister_mtd_user(&mtdpstore_notifier); +} +module_exit(mtdpstore_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao "); +MODULE_DESCRIPTION("MTD backend for pstore/blk");