diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index f627e705e663..b268e3e18b4a 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -512,3 +512,19 @@ Date: July 2021 Contact: "Daeho Jeong" Description: You can control the multiplier value of bdi device readahead window size between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option. + +What: /sys/fs/f2fs//max_fragment_chunk +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. + f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. + +What: /sys/fs/f2fs//max_fragment_hole +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. + f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 7f9b40d6b416..2e2a2c627096 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 +.. _inline_encryption: + ================= Inline Encryption ================= diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index b97579b7d8fb..01df283c7d04 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -19,9 +19,10 @@ It is designed as a better filesystem solution for the following scenarios: immutable and bit-for-bit identical to the official golden image for their releases due to security and other considerations and - - hope to save some extra storage space with guaranteed end-to-end performance - by using reduced metadata and transparent file compression, especially - for those embedded devices with limited memory (ex, smartphone); + - hope to minimize extra storage space with guaranteed end-to-end performance + by using compact layout, transparent file compression and direct access, + especially for those embedded devices with limited memory and high-density + hosts with numerous containers; Here is the main features of EROFS: @@ -51,7 +52,9 @@ Here is the main features of EROFS: - Support POSIX.1e ACLs by using xattrs; - Support transparent data compression as an option: - LZ4 algorithm with the fixed-sized output compression for high performance. + LZ4 algorithm with the fixed-sized output compression for high performance; + + - Multiple device support for multi-layer container images. The following git tree provides the file system user-space tools under development (ex, formatting tool mkfs.erofs): @@ -87,6 +90,7 @@ cache_strategy=%s Select a strategy for cached decompression from now on: dax={always,never} Use direct access (no page cache). See Documentation/filesystems/dax.rst. dax A legacy option which is an alias for ``dax=always``. +device=%s Specify a path to an extra device to be used together. =================== ========================================================= On-disk details diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 09de6ebbbdfa..6954c04753ad 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -197,10 +197,29 @@ fault_type=%d Support configuring fault injection type, should be FAULT_DISCARD 0x000002000 FAULT_WRITE_IO 0x000004000 FAULT_SLAB_ALLOC 0x000008000 + FAULT_DQUOT_INIT 0x000010000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. + "fragment:segment" and "fragment:block" are newly added here. + These are developer options for experiments to simulate filesystem + fragmentation/after-GC situation itself. The developers use these + modes to understand filesystem fragmentation/after-GC condition well, + and eventually get some insights to handle them better. + In "fragment:segment", f2fs allocates a new segment in ramdom + position. With this, we can simulate the after-GC condition. + In "fragment:block", we can scatter block allocation with + "max_fragment_chunk" and "max_fragment_hole" sysfs nodes. + We added some randomness to both chunk and hole size to make + it close to realistic IO pattern. So, in this mode, f2fs will allocate + 1.. blocks in a chunk and make a hole in the + length of 1.. by turns. With this, the newly + allocated blocks will be scattered throughout the whole partition. + Note that "fragment:block" implicitly enables "fragment:segment" + option for more randomness. + Please, use these options for your experiments and we strongly + recommend to re-format the filesystem after using these options. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". usrquota Enable plain user disk quota accounting. diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 7940a45d3952..4d5d50dca65c 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -77,11 +77,11 @@ Side-channel attacks fscrypt is only resistant to side-channel attacks, such as timing or electromagnetic attacks, to the extent that the underlying Linux -Cryptographic API algorithms are. If a vulnerable algorithm is used, -such as a table-based implementation of AES, it may be possible for an -attacker to mount a side channel attack against the online system. -Side channel attacks may also be mounted against applications -consuming decrypted data. +Cryptographic API algorithms or inline encryption hardware are. If a +vulnerable algorithm is used, such as a table-based implementation of +AES, it may be possible for an attacker to mount a side channel attack +against the online system. Side channel attacks may also be mounted +against applications consuming decrypted data. Unauthorized file access ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1135,6 +1135,50 @@ where applications may later write sensitive data. It is recommended that systems implementing a form of "verified boot" take advantage of this by validating all top-level encryption policies prior to access. +Inline encryption support +========================= + +By default, fscrypt uses the kernel crypto API for all cryptographic +operations (other than HKDF, which fscrypt partially implements +itself). The kernel crypto API supports hardware crypto accelerators, +but only ones that work in the traditional way where all inputs and +outputs (e.g. plaintexts and ciphertexts) are in memory. fscrypt can +take advantage of such hardware, but the traditional acceleration +model isn't particularly efficient and fscrypt hasn't been optimized +for it. + +Instead, many newer systems (especially mobile SoCs) have *inline +encryption hardware* that can encrypt/decrypt data while it is on its +way to/from the storage device. Linux supports inline encryption +through a set of extensions to the block layer called *blk-crypto*. +blk-crypto allows filesystems to attach encryption contexts to bios +(I/O requests) to specify how the data will be encrypted or decrypted +in-line. For more information about blk-crypto, see +:ref:`Documentation/block/inline-encryption.rst `. + +On supported filesystems (currently ext4 and f2fs), fscrypt can use +blk-crypto instead of the kernel crypto API to encrypt/decrypt file +contents. To enable this, set CONFIG_FS_ENCRYPTION_INLINE_CRYPT=y in +the kernel configuration, and specify the "inlinecrypt" mount option +when mounting the filesystem. + +Note that the "inlinecrypt" mount option just specifies to use inline +encryption when possible; it doesn't force its use. fscrypt will +still fall back to using the kernel crypto API on files where the +inline encryption hardware doesn't have the needed crypto capabilities +(e.g. support for the needed encryption algorithm and data unit size) +and where blk-crypto-fallback is unusable. (For blk-crypto-fallback +to be usable, it must be enabled in the kernel configuration with +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y.) + +Currently fscrypt always uses the filesystem block size (which is +usually 4096 bytes) as the data unit size. Therefore, it can only use +inline encryption hardware that supports that data unit size. + +Inline encryption doesn't affect the ciphertext or other aspects of +the on-disk format, so users may freely switch back and forth between +using "inlinecrypt" and not using "inlinecrypt". + Implementation details ====================== @@ -1184,6 +1228,13 @@ keys`_ and `DIRECT_KEY policies`_. Data path changes ----------------- +When inline encryption is used, filesystems just need to associate +encryption contexts with bios to specify how the block layer or the +inline encryption hardware will encrypt/decrypt the file contents. + +When inline encryption isn't used, filesystems must encrypt/decrypt +the file contents themselves, as described below: + For the read path (->readpage()) of regular files, filesystems can read the ciphertext into the page cache and decrypt it in-place. The page lock must be held until decryption has finished, to prevent the @@ -1197,18 +1248,6 @@ buffer. Some filesystems, such as UBIFS, already use temporary buffers regardless of encryption. Other filesystems, such as ext4 and F2FS, have to allocate bounce pages specially for encryption. -Fscrypt is also able to use inline encryption hardware instead of the -kernel crypto API for en/decryption of file contents. When possible, -and if directed to do so (by specifying the 'inlinecrypt' mount option -for an ext4/F2FS filesystem), it adds encryption contexts to bios and -uses blk-crypto to perform the en/decryption instead of making use of -the above read/write path changes. Of course, even if directed to -make use of inline encryption, fscrypt will only be able to do so if -either hardware inline encryption support is available for the -selected encryption algorithm or CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK -is selected. If neither is the case, fscrypt will fall back to using -the above mentioned read/write path changes for en/decryption. - Filename hashing and encoding ----------------------------- diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 68a2de6b5a9b..bfc2a5b74ed3 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -1,23 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This contains encryption functions for per-file encryption. + * Utility functions for file contents encryption/decryption on + * block device-based filesystems. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Add fscrypt_pullback_bio_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. */ #include @@ -26,6 +13,21 @@ #include #include "fscrypt_private.h" +/** + * fscrypt_decrypt_bio() - decrypt the contents of a bio + * @bio: the bio to decrypt + * + * Decrypt the contents of a "read" bio following successful completion of the + * underlying disk read. The bio must be reading a whole number of blocks of an + * encrypted file directly into the page cache. If the bio is reading the + * ciphertext into bounce pages instead of the page cache (for example, because + * the file is also compressed, so decompression is required after decryption), + * then this function isn't applicable. This function may sleep, so it must be + * called from a workqueue rather than from the bio's bi_end_io callback. + * + * This function sets PG_error on any pages that contain any blocks that failed + * to be decrypted. The filesystem must not mark such pages uptodate. + */ void fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index eb538c28df94..a9be4bc74a94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (fscrypt_has_encryption_key(dir)) { if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, - dir->i_sb->s_cop->max_namelen, + iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 9322a1c984fb..bf75ecda9459 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -20,6 +20,11 @@ #define FSCRYPT_FILE_NONCE_SIZE 16 +/* + * Minimum size of an fscrypt master key. Note: a longer key will be required + * if ciphers with a 256-bit security strength are used. This is just the + * absolute minimum, which applies when only 128-bit encryption is used. + */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE 128 @@ -437,7 +442,11 @@ struct fscrypt_master_key_secret { */ struct fscrypt_hkdf hkdf; - /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + /* + * Size of the raw key in bytes. This remains set even if ->raw was + * zeroized due to no longer being needed. I.e. we still remember the + * size of the key even if we don't need to remember the key itself. + */ u32 size; /* True if the key in ->raw is a hardware-wrapped key. */ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 05f4693f7e15..bec2915effe5 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -122,8 +122,9 @@ err_free_tfm: /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the - * raw key, encryption mode, and flag indicating which encryption implementation - * (fs-layer or blk-crypto) will be used. + * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption + * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), + * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, unsigned int raw_key_size, diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 14b747026742..f57255ab88ed 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -6,16 +6,22 @@ config EROFS_FS select FS_IOMAP select LIBCRC32C help - EROFS (Enhanced Read-Only File System) is a lightweight - read-only file system with modern designs (eg. page-sized - blocks, inline xattrs/data, etc.) for scenarios which need - high-performance read-only requirements, e.g. Android OS - for mobile phones and LIVECDs. + EROFS (Enhanced Read-Only File System) is a lightweight read-only + file system with modern designs (e.g. no buffer heads, inline + xattrs/data, chunk-based deduplication, multiple devices, etc.) for + scenarios which need high-performance read-only solutions, e.g. + smartphones with Android OS, LiveCDs and high-density hosts with + numerous containers; - It also provides fixed-sized output compression support, - which improves storage density, keeps relatively higher - compression ratios, which is more useful to achieve high - performance for embedded devices with limited memory. + It also provides fixed-sized output compression support in order to + improve storage density as well as keep relatively higher compression + ratios and implements in-place decompression to reuse the file page + for compressed data temporarily with proper strategies, which is + quite useful to ensure guaranteed end-to-end runtime decompression + performance under extremely memory pressure without extra cost. + + See the documentation at + for more details. If unsure, say N. @@ -76,3 +82,19 @@ config EROFS_FS_ZIP Enable fixed-sized output compression for EROFS. If you don't want to enable compression feature, say N. + +config EROFS_FS_ZIP_LZMA + bool "EROFS LZMA compressed data support" + depends on EROFS_FS_ZIP + select XZ_DEC + select XZ_DEC_MICROLZMA + help + Saying Y here includes support for reading EROFS file systems + containing LZMA compressed data, specifically called microLZMA. it + gives better compression ratios than the LZ4 algorithm, at the + expense of more CPU overhead. + + LZMA support is an experimental feature for now and so most file + systems will be readable without selecting this option. + + If unsure, say N. diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 1f9aced49070..756fe2d65272 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o +erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 3701c72bacb2..579406504919 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -8,11 +8,6 @@ #include "internal.h" -enum { - Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, - Z_EROFS_COMPRESSION_RUNTIME_MAX -}; - struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; @@ -25,6 +20,12 @@ struct z_erofs_decompress_req { bool inplace_io, partial_decoding; }; +struct z_erofs_decompressor { + int (*decompress)(struct z_erofs_decompress_req *rq, + struct page **pagepool); + char *name; +}; + /* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) #define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) @@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page) return true; } -static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, +static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, struct page *page) { if (!z_erofs_is_shortlived_page(page)) @@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, put_page(page); } else { /* follow the pcluster rule above. */ - set_page_private(page, 0); - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); } return true; } -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); + +/* prototypes for specific algorithms */ +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 9db829715652..808234d9190c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode, erofs_off_t pos; int err = 0; + map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; @@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode, map->m_flags = 0; break; default: - /* only one device is supported for now */ - if (idx->device_id) { - erofs_err(sb, "invalid device id %u @ %llu for nid %llu", - le16_to_cpu(idx->device_id), - chunknr, vi->nid); - err = -EFSCORRUPTED; - goto out_unlock; - } + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; @@ -155,11 +150,55 @@ out: return err; } +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) +{ + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + /* primary device by default */ + map->m_bdev = sb->s_bdev; + map->m_daxdev = EROFS_SB(sb)->dax_dev; + + if (map->m_deviceid) { + down_read(&devs->rwsem); + dif = idr_find(&devs->tree, map->m_deviceid - 1); + if (!dif) { + up_read(&devs->rwsem); + return -ENODEV; + } + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + up_read(&devs->rwsem); + } else if (devs->extra_devices) { + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + erofs_off_t startoff, length; + + if (!dif->mapped_blkaddr) + continue; + startoff = blknr_to_addr(dif->mapped_blkaddr); + length = blknr_to_addr(dif->blocks); + + if (map->m_pa >= startoff && + map->m_pa < startoff + length) { + map->m_pa -= startoff; + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + break; + } + } + up_read(&devs->rwsem); + } + return 0; +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct erofs_map_blocks map; + struct erofs_map_dev mdev; map.m_la = offset; map.m_llen = length; @@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(inode->i_sb, &mdev); + if (ret) + return ret; + + iomap->bdev = mdev.m_bdev; + iomap->dax_dev = mdev.m_daxdev; iomap->offset = map.m_la; iomap->length = map.m_llen; iomap->flags = 0; @@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_INLINE; ipage = erofs_get_meta_page(inode->i_sb, - erofs_blknr(map.m_pa)); + erofs_blknr(mdev.m_pa)); if (IS_ERR(ipage)) return PTR_ERR(ipage); iomap->inline_data = page_address(ipage) + - erofs_blkoff(map.m_pa); + erofs_blkoff(mdev.m_pa); iomap->private = ipage; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = mdev.m_pa; } return 0; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index ad3f31380e6b..bf37fc76b182 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -16,17 +16,6 @@ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif -struct z_erofs_decompressor { - /* - * if destpages have sparsed pages, fill them with bounce pages. - * it also check whether destpages indicate continuous physical memory. - */ - int (*prepare_destpages)(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); - int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); - char *name; -}; - int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) @@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb, return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); } -static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +/* + * Fill all gaps with bounce pages if it's a sparse page list. Also check if + * all physical pages are consecutive, which can be seen for moderate CR. + */ +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nr = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, return kaddr ? 1 : 0; } -static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq, +static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, void *inpage, unsigned int *inputmargin, int *maptype, bool support_0padding) { @@ -189,7 +182,8 @@ docopy: return src; } -static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, + u8 *out) { unsigned int inputmargin; u8 *headpage, *src; @@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) } rq->inputsize -= inputmargin; - src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype, - support_0padding); + src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin, + &maptype, support_0padding); if (IS_ERR(src)) return PTR_ERR(src); @@ -241,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); ret = -EIO; + } else { + ret = 0; } if (maptype == 0) { @@ -256,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) return ret; } -static struct z_erofs_decompressor decompressors[] = { - [Z_EROFS_COMPRESSION_SHIFTED] = { - .name = "shifted" - }, - [Z_EROFS_COMPRESSION_LZ4] = { - .prepare_destpages = z_erofs_lz4_prepare_destpages, - .decompress = z_erofs_lz4_decompress, - .name = "lz4" - }, -}; - -static void copy_from_pcpubuf(struct page **out, const char *dst, - unsigned short pageofs_out, - unsigned int outputsize) -{ - const char *end = dst + outputsize; - const unsigned int righthalf = PAGE_SIZE - pageofs_out; - const char *cur = dst - pageofs_out; - - while (cur < end) { - struct page *const page = *out++; - - if (page) { - char *buf = kmap_atomic(page); - - if (cur >= dst) { - memcpy(buf, cur, min_t(uint, PAGE_SIZE, - end - cur)); - } else { - memcpy(buf + pageofs_out, cur + pageofs_out, - min_t(uint, righthalf, end - cur)); - } - kunmap_atomic(buf); - } - cur += PAGE_SIZE; - } -} - -static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const struct z_erofs_decompressor *alg = decompressors + rq->alg; unsigned int dst_maptype; void *dst; int ret; - /* two optimized fast paths only for non bigpcluster cases yet */ - if (rq->inputsize <= PAGE_SIZE) { - if (nrpages_out == 1 && !rq->inplace_io) { - DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); - dst_maptype = 0; - goto dstmap_out; - } - - /* - * For the case of small output size (especially much less - * than PAGE_SIZE), memcpy the decompressed data rather than - * compressed data is preferred. - */ - if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(1); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - rq->inplace_io = false; - ret = alg->decompress(rq, dst); - if (!ret) - copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, - rq->outputsize); - - erofs_put_pcpubuf(dst); - return ret; - } + /* one optimized fast path only for non bigpcluster cases yet */ + if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; } /* general decoding path which can be used for all cases */ - ret = alg->prepare_destpages(rq, pagepool); + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); if (ret < 0) return ret; if (ret) { @@ -350,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, dst_maptype = 2; dstmap_out: - ret = alg->decompress(rq, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out); if (!dst_maptype) kunmap_atomic(dst); @@ -359,8 +294,8 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -398,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, return 0; } +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .decompress = z_erofs_shifted_transform, + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +#ifdef CONFIG_EROFS_FS_ZIP_LZMA + [Z_EROFS_COMPRESSION_LZMA] = { + .decompress = z_erofs_lzma_decompress, + .name = "lzma" + }, +#endif +}; + int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { - if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) - return z_erofs_shifted_transform(rq, pagepool); - return z_erofs_decompress_generic(rq, pagepool); + return decompressors[rq->alg].decompress(rq, pagepool); } diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c new file mode 100644 index 000000000000..50045510a1f4 --- /dev/null +++ b/fs/erofs/decompressor_lzma.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include "compress.h" + +struct z_erofs_lzma { + struct z_erofs_lzma *next; + struct xz_dec_microlzma *state; + struct xz_buf buf; + u8 bounce[PAGE_SIZE]; +}; + +/* considering the LZMA performance, no need to use a lockless list for now */ +static DEFINE_SPINLOCK(z_erofs_lzma_lock); +static unsigned int z_erofs_lzma_max_dictsize; +static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms; +static struct z_erofs_lzma *z_erofs_lzma_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); + +module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); + +void z_erofs_lzma_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_lzma_avail_strms) { + struct z_erofs_lzma *strm; + + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + DBG_BUGON(1); + return; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + while (strm) { + struct z_erofs_lzma *n = strm->next; + + if (strm->state) + xz_dec_microlzma_end(strm->state); + kfree(strm); + --z_erofs_lzma_avail_strms; + strm = n; + } + } +} + +int z_erofs_lzma_init(void) +{ + unsigned int i; + + /* by default, use # of possible CPUs instead */ + if (!z_erofs_lzma_nstrms) + z_erofs_lzma_nstrms = num_possible_cpus(); + + for (i = 0; i < z_erofs_lzma_nstrms; ++i) { + struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL); + + if (!strm) { + z_erofs_lzma_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + ++z_erofs_lzma_avail_strms; + } + return 0; +} + +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) +{ + static DEFINE_MUTEX(lzma_resize_mutex); + unsigned int dict_size, i; + struct z_erofs_lzma *strm, *head = NULL; + int err; + + if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) { + erofs_err(sb, "invalid lzma cfgs, size=%u", size); + return -EINVAL; + } + if (lzma->format) { + erofs_err(sb, "unidentified lzma format %x, please check kernel version", + le16_to_cpu(lzma->format)); + return -EINVAL; + } + dict_size = le32_to_cpu(lzma->dict_size); + if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) { + erofs_err(sb, "unsupported lzma dictionary size %u", + dict_size); + return -EINVAL; + } + + erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); + + /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ + mutex_lock(&lzma_resize_mutex); + + if (z_erofs_lzma_max_dictsize >= dict_size) { + mutex_unlock(&lzma_resize_mutex); + return 0; + } + + /* 1. collect/isolate all streams for the following check */ + for (i = 0; i < z_erofs_lzma_avail_strms; ++i) { + struct z_erofs_lzma *last; + +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, + READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + for (last = strm; last->next; last = last->next) + ++i; + last->next = head; + head = strm; + } + + err = 0; + /* 2. walk each isolated stream and grow max dict_size if needed */ + for (strm = head; strm; strm = strm->next) { + if (strm->state) + xz_dec_microlzma_end(strm->state); + strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size); + if (!strm->state) + err = -ENOMEM; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_lzma_lock); + DBG_BUGON(z_erofs_lzma_head); + z_erofs_lzma_head = head; + spin_unlock(&z_erofs_lzma_lock); + + z_erofs_lzma_max_dictsize = dict_size; + mutex_unlock(&lzma_resize_mutex); + return err; +} + +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + unsigned int inputmargin, inlen, outlen, pageofs; + struct z_erofs_lzma *strm; + u8 *kin; + bool bounced = false; + int no, ni, j, err = 0; + + /* 1. get the exact LZMA compressed size */ + kin = kmap(*rq->in); + inputmargin = 0; + while (!kin[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= PAGE_SIZE) { + kunmap(*rq->in); + return -EFSCORRUPTED; + } + rq->inputsize -= inputmargin; + + /* 2. get an available lzma context */ +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = strm->next; + spin_unlock(&z_erofs_lzma_lock); + + /* 3. multi-call decompress */ + inlen = rq->inputsize; + outlen = rq->outputsize; + xz_dec_microlzma_reset(strm->state, inlen, outlen, + !rq->partial_decoding); + pageofs = rq->pageofs_out; + strm->buf.in = kin + inputmargin; + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin); + inlen -= strm->buf.in_size; + strm->buf.out = NULL; + strm->buf.out_pos = 0; + strm->buf.out_size = 0; + + for (ni = 0, no = -1;;) { + enum xz_ret xz_err; + + if (strm->buf.out_pos == strm->buf.out_size) { + if (strm->buf.out) { + kunmap(rq->out[no]); + strm->buf.out = NULL; + } + + if (++no >= nrpages_out || !outlen) { + erofs_err(rq->sb, "decompressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.out_pos = 0; + strm->buf.out_size = min_t(u32, outlen, + PAGE_SIZE - pageofs); + outlen -= strm->buf.out_size; + if (rq->out[no]) + strm->buf.out = kmap(rq->out[no]) + pageofs; + pageofs = 0; + } else if (strm->buf.in_pos == strm->buf.in_size) { + kunmap(rq->in[ni]); + + if (++ni >= nrpages_in || !inlen) { + erofs_err(rq->sb, "compressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); + inlen -= strm->buf.in_size; + kin = kmap(rq->in[ni]); + strm->buf.in = kin; + bounced = false; + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Otherwise, Use short-lived pages + * from the on-stack pagepool where pages share with the same + * request. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); + strm->buf.in = strm->bounce; + bounced = true; + } + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); + DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); + DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + + if (xz_err != XZ_OK) { + if (xz_err == XZ_STREAM_END && !outlen) + break; + erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", + xz_err, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + if (no < nrpages_out && strm->buf.out) + kunmap(rq->in[no]); + if (ni < nrpages_in) + kunmap(rq->in[ni]); + /* 4. push back LZMA stream context to the global list */ + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + wake_up(&z_erofs_lzma_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b0b23f41abc3..083997a034e5 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -21,14 +21,29 @@ #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 +#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 +#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2) #define EROFS_SB_EXTSLOT_SIZE 16 +struct erofs_deviceslot { + union { + u8 uuid[16]; /* used for device manager later */ + u8 userdata[64]; /* digest(sha256), etc. */ + } u; + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 reserved[56]; +}; +#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) + /* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ @@ -54,7 +69,9 @@ struct erofs_super_block { /* customized sliding window size instead of 64k by default */ __le16 lz4_max_distance; } __packed u1; - __u8 reserved2[42]; + __le16 extra_devices; /* # of devices besides the primary device */ + __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ + __u8 reserved2[38]; }; /* @@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 8-byte inode chunk indexes */ struct erofs_inode_chunk_index { __le16 advise; /* always 0, don't care for now */ - __le16 device_id; /* back-end storage id, always 0 for now */ + __le16 device_id; /* back-end storage id (with bits masked) */ __le32 blkaddr; /* start block address of this inode chunk */ }; @@ -247,10 +264,11 @@ struct erofs_inode_chunk_index { /* available compression algorithm types (for h_algorithmtype) */ enum { - Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZMA = 1, Z_EROFS_COMPRESSION_MAX }; -#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) +#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) /* 14 bytes (+ length field = 16 bytes) */ struct z_erofs_lz4_cfgs { @@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs { u8 reserved[10]; } __packed; +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lzma_cfgs { + __le32 dict_size; + __le16 format; + u8 reserved[8]; +} __packed; + +#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; @@ -288,35 +315,34 @@ struct z_erofs_map_header { #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 /* - * Fixed-sized output compression ondisk Logical Extent cluster type: - * 0 - literal (uncompressed) cluster - * 1 - compressed cluster (for the head logical cluster) - * 2 - compressed cluster (for the other logical clusters) + * Fixed-sized output compression on-disk logical cluster type: + * 0 - literal (uncompressed) lcluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * * In detail, - * 0 - literal (uncompressed) cluster, + * 0 - literal (uncompressed) lcluster, * di_advise = 0 - * di_clusterofs = the literal data offset of the cluster - * di_blkaddr = the blkaddr of the literal cluster + * di_clusterofs = the literal data offset of the lcluster + * di_blkaddr = the blkaddr of the literal pcluster * - * 1 - compressed cluster (for the head logical cluster) - * di_advise = 1 - * di_clusterofs = the decompressed data offset of the cluster - * di_blkaddr = the blkaddr of the compressed cluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * di_advise = 1 or 3 + * di_clusterofs = the decompressed data offset of the lcluster + * di_blkaddr = the blkaddr of the compressed pcluster * - * 2 - compressed cluster (for the other logical clusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * di_advise = 2 * di_clusterofs = - * the decompressed data offset in its own head cluster - * di_u.delta[0] = distance to its corresponding head cluster - * di_u.delta[1] = distance to its corresponding tail cluster - * (di_advise could be 0, 1 or 2) + * the decompressed data offset in its own HEAD lcluster + * di_u.delta[0] = distance to this HEAD lcluster + * di_u.delta[1] = distance to the next HEAD lcluster */ enum { Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, Z_EROFS_VLE_CLUSTER_TYPE_MAX }; @@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void) /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != sizeof(struct z_erofs_vle_decompressed_index)); + BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index a552399e211d..2345f1de438e 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_flags &= ~S_DAX; - if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && vi->datalayout == EROFS_INODE_FLAT_PLAIN) inode->i_flags |= S_DAX; if (!nblks) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9524e155b38f..3265688af7f9 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,7 +47,16 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; -struct erofs_fs_context { +struct erofs_device_info { + char *path; + struct block_device *bdev; + struct dax_device *dax_dev; + + u32 blocks; + u32 mapped_blkaddr; +}; + +struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; @@ -60,6 +69,18 @@ struct erofs_fs_context { unsigned int mount_opt; }; +struct erofs_dev_context { + struct idr tree; + struct rw_semaphore rwsem; + + unsigned int extra_devices; +}; + +struct erofs_fs_context { + struct erofs_mount_opts opt; + struct erofs_dev_context *devs; +}; + /* all filesystem-wide lz4 configurations */ struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ @@ -69,6 +90,7 @@ struct erofs_sb_lz4_info { }; struct erofs_sb_info { + struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -85,12 +107,16 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct erofs_dev_context *devs; struct dax_device *dax_dev; - u32 blocks; + u64 total_blocks; + u32 primarydevice_blocks; + u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; #endif + u16 device_id_mask; /* valid bits of device id to be used */ /* inode slot unit size in bit shift */ unsigned char islotbits; @@ -108,8 +134,6 @@ struct erofs_sb_info { u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; - - struct erofs_fs_context ctx; /* options */ }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -121,9 +145,9 @@ struct erofs_sb_info { #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 -#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) -#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) -#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option) +#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) enum { EROFS_ZIP_CACHE_DISABLED, @@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value) EROFS_I_DATALAYOUT_BITS); } +/* + * Different from grab_cache_page_nowait(), reclaiming is never triggered + * when allocating new pages. + */ +static inline +struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); +} + extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; @@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops; * of the corresponding uncompressed data in the file. */ enum { - BH_Zipped = BH_PrivateStart, + BH_Encoded = BH_PrivateStart, BH_FullMapped, }; @@ -346,8 +384,8 @@ enum { #define EROFS_MAP_MAPPED (1 << BH_Mapped) /* Located in metadata (could be copied from bd_inode) */ #define EROFS_MAP_META (1 << BH_Meta) -/* The extent has been compressed */ -#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The extent is encoded */ +#define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) @@ -355,6 +393,8 @@ struct erofs_map_blocks { erofs_off_t m_pa, m_la; u64 m_plen, m_llen; + unsigned short m_deviceid; + char m_algorithmformat; unsigned int m_flags; struct page *mpage; @@ -367,6 +407,13 @@ struct erofs_map_blocks { * approach instead if possible since it's more metadata lightweight.) */ #define EROFS_GET_BLOCKS_FIEMAP 0x0002 +/* Used to map the whole extent if non-negligible data is requested for LZMA */ +#define EROFS_GET_BLOCKS_READMORE 0x0004 + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; /* zmap.c */ extern const struct iomap_ops z_erofs_iomap_report_ops; @@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, } #endif /* !CONFIG_EROFS_FS_ZIP */ +struct erofs_map_dev { + struct block_device *m_bdev; + struct dax_device *m_daxdev; + + erofs_off_t m_pa; + unsigned int m_deviceid; +}; + /* data.c */ extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); /* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +static inline void erofs_pagepool_add(struct page **pagepool, + struct page *page) +{ + set_page_private(page, (unsigned long)*pagepool); + *pagepool = page; +} +void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); @@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_ZIP_LZMA +int z_erofs_lzma_init(void); +void z_erofs_lzma_exit(void); +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size); +#else +static inline int z_erofs_lzma_init(void) { return 0; } +static inline int z_erofs_lzma_exit(void) { return 0; } +static inline int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) { + if (lzma) { + erofs_err(sb, "lzma algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c index 6c885575128a..a2efd833d1b6 100644 --- a/fs/erofs/pcpubuf.c +++ b/fs/erofs/pcpubuf.c @@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) { static DEFINE_MUTEX(pcb_resize_mutex); static unsigned int pcb_nrpages; - LIST_HEAD(pagepool); + struct page *pagepool = NULL; int delta, cpu, ret, i; mutex_lock(&pcb_resize_mutex); @@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) vunmap(old_ptr); free_pagearray: while (i) - list_add(&oldpages[--i]->lru, &pagepool); + erofs_pagepool_add(&pagepool, oldpages[--i]); kfree(oldpages); if (ret) break; } pcb_nrpages = nrpages; - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); out: mutex_unlock(&pcb_resize_mutex); return ret; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 11b88559f8bf..6a969b1e0ee6 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZ4: ret = z_erofs_load_lz4_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_LZMA: + ret = z_erofs_load_lzma_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } #endif +static int erofs_init_devices(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int ondisk_extradevs; + erofs_off_t pos; + struct page *page = NULL; + struct erofs_device_info *dif; + struct erofs_deviceslot *dis; + void *ptr; + int id, err = 0; + + sbi->total_blocks = sbi->primarydevice_blocks; + if (!erofs_sb_has_device_table(sbi)) + ondisk_extradevs = 0; + else + ondisk_extradevs = le16_to_cpu(dsb->extra_devices); + + if (ondisk_extradevs != sbi->devs->extra_devices) { + erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", + ondisk_extradevs, sbi->devs->extra_devices); + return -EINVAL; + } + if (!ondisk_extradevs) + return 0; + + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; + down_read(&sbi->devs->rwsem); + idr_for_each_entry(&sbi->devs->tree, dif, id) { + erofs_blk_t blk = erofs_blknr(pos); + struct block_device *bdev; + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + up_read(&sbi->devs->rwsem); + return PTR_ERR(page); + } + ptr = kmap(page); + } + dis = ptr + erofs_blkoff(pos); + + bdev = blkdev_get_by_path(dif->path, + FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto err_out; + } + dif->bdev = bdev; + dif->dax_dev = fs_dax_get_by_bdev(bdev); + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + pos += EROFS_DEVT_SLOT_SIZE; + } +err_out: + up_read(&sbi->devs->rwsem); + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + return err; +} + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb) ret = erofs_load_compr_cfgs(sb, dsb); else ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + if (ret < 0) + goto out; + + /* handle multiple devices */ + ret = erofs_init_devices(sb, dsb); out: kunmap(page); put_page(page); @@ -340,15 +421,15 @@ out: static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->max_sync_decompress_pages = 3; - ctx->readahead_sync_decompress = false; + ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + ctx->opt.max_sync_decompress_pages = 3; + ctx->opt.readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); #endif } @@ -358,6 +439,7 @@ enum { Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_err }; @@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), + fsparam_string("device", Opt_device), {} }; @@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(ctx, DAX_ALWAYS); - clear_opt(ctx, DAX_NEVER); + set_opt(&ctx->opt, DAX_ALWAYS); + clear_opt(&ctx->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(ctx, DAX_NEVER); - clear_opt(ctx, DAX_ALWAYS); + set_opt(&ctx->opt, DAX_NEVER); + clear_opt(&ctx->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx __maybe_unused = fc->fs_private; + struct erofs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + struct erofs_device_info *dif; + int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) @@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); else - clear_opt(ctx, XATTR_USER); + clear_opt(&ctx->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); else - clear_opt(ctx, POSIX_ACL); + clear_opt(&ctx->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = result.uint_32; + ctx->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc, if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; + case Opt_device: + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) + return -ENOMEM; + dif->path = kstrdup(param->string, GFP_KERNEL); + if (!dif->path) { + kfree(dif); + return -ENOMEM; + } + down_write(&ctx->devs->rwsem); + ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&ctx->devs->rwsem); + if (ret < 0) { + kfree(dif->path); + kfree(dif); + return ret; + } + ++ctx->devs->extra_devices; + break; default: return -ENOPARAM; } @@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; sb->s_fs_info = sbi; + sbi->opt = ctx->opt; sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + sbi->devs = ctx->devs; + ctx->devs = NULL; + err = erofs_read_superblock(sb); if (err) return err; - if (test_opt(ctx, DAX_ALWAYS) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(ctx, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_ALWAYS); } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sb->s_xattr = erofs_xattr_handlers; - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; - #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif @@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; + sbi->opt = ctx->opt; fc->sb_flags |= SB_RDONLY; return 0; } +static int erofs_release_device_info(int id, void *ptr, void *data) +{ + struct erofs_device_info *dif = ptr; + + fs_put_dax(dif->dax_dev); + if (dif->bdev) + blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + kfree(dif->path); + kfree(dif); + return 0; +} + +static void erofs_free_dev_context(struct erofs_dev_context *devs) +{ + if (!devs) + return; + idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_destroy(&devs->tree); + kfree(devs); +} + static void erofs_fc_free(struct fs_context *fc) { - kfree(fc->fs_private); + struct erofs_fs_context *ctx = fc->fs_private; + + erofs_free_dev_context(ctx->devs); + kfree(ctx); } static const struct fs_context_operations erofs_context_ops = { @@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = { static int erofs_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL); - if (!fc->fs_private) + struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + + if (!ctx) return -ENOMEM; + ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!ctx->devs) { + kfree(ctx); + return -ENOMEM; + } + fc->fs_private = ctx; - /* set default mount options */ - erofs_default_options(fc->fs_private); - + idr_init(&ctx->devs->tree); + init_rwsem(&ctx->devs->rwsem); + erofs_default_options(ctx); fc->ops = &erofs_context_ops; - return 0; } @@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + + erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; @@ -706,6 +843,10 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + err = z_erofs_lzma_init(); + if (err) + goto lzma_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -720,6 +861,8 @@ static int __init erofs_module_init(void) fs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_lzma_exit(); +lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); @@ -730,11 +873,13 @@ icache_err: static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); - z_erofs_exit_zip_subsystem(); - erofs_exit_shrinker(); - /* Ensure all RCU free inodes are safe before cache is destroyed. */ + /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); + + z_erofs_exit_zip_subsystem(); + z_erofs_lzma_exit(); + erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } @@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = EROFS_BLKSIZ; - buf->f_blocks = sbi->blocks; + buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; @@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx = &sbi->ctx; + struct erofs_mount_opts *opt = &sbi->opt; #ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(ctx, XATTR_USER)) + if (test_opt(opt, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(opt, POSIX_ACL)) seq_puts(seq, ",acl"); else seq_puts(seq, ",noacl"); #endif #ifdef CONFIG_EROFS_FS_ZIP - if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED) + if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) seq_puts(seq, ",cache_strategy=disabled"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) seq_puts(seq, ",cache_strategy=readahead"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif - if (test_opt(ctx, DAX_ALWAYS)) + if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); - if (test_opt(ctx, DAX_NEVER)) + if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); return 0; } diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 3ca703cd5b24..ec9a1d780dc1 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -6,20 +6,29 @@ #include "internal.h" #include -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { - struct page *page; + struct page *page = *pagepool; - if (!list_empty(pool)) { - page = lru_to_page(pool); + if (page) { DBG_BUGON(page_ref_count(page) != 1); - list_del(&page->lru); + *pagepool = (struct page *)page_private(page); } else { page = alloc_page(gfp); } return page; } +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + put_page(page); + } +} + #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 778f2c52295d..01c581e93c5f 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) static bool erofs_xattr_user_list(struct dentry *dentry) { - return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER); + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); } static bool erofs_xattr_trusted_list(struct dentry *dentry) @@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, switch (handler->flags) { case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->ctx, XATTR_USER)) + if (!test_opt(&sbi->opt, XATTR_USER)) return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index eb51df4a9f77..9a249bfc2770 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -96,16 +96,9 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * a compressed_pages[] placeholder in order to avoid - * being filled with file pages for in-place decompression. - */ -#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D) - /* how to allocate cached pages for a pcluster */ enum z_erofs_cache_alloctype { DONTALLOC, /* don't allocate any cached pages */ - DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ /* * try to use cached I/O if page allocation succeeds or fallback * to in-place I/O instead to avoid any direct reclaim. @@ -236,7 +229,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + struct page **pagepool) { struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; @@ -267,10 +260,6 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, /* I/O is needed, no possible to decompress directly */ standalone = false; switch (type) { - case DELAYEDALLOC: - t = tagptr_init(compressed_page_t, - PAGE_UNALLOCATED); - break; case TRYALLOC: newpage = erofs_allocpage(pagepool, gfp); if (!newpage) @@ -287,12 +276,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; - if (page) { + if (page) put_page(page); - } else if (newpage) { - set_page_private(newpage, 0); - list_add(&newpage->lru, pagepool); - } + else if (newpage) + erofs_pagepool_add(pagepool, newpage); } /* @@ -476,6 +463,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct erofs_workgroup *grp; int err; + if (!(map->m_flags & EROFS_MAP_ENCODED)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + /* no available pcluster, let's allocate one */ pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) @@ -483,16 +475,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; - + pcl->algorithmformat = map->m_algorithmformat; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | (map->m_flags & EROFS_MAP_FULL_MAPPED ? Z_EROFS_PCLUSTER_FULL_LENGTH : 0); - if (map->m_flags & EROFS_MAP_ZIPPED) - pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; - else - pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; @@ -643,7 +630,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct list_head *pagepool) + struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -695,7 +682,7 @@ restart_now: goto err_out; /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la)) + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) cache_strategy = TRYALLOC; else cache_strategy = DONTALLOC; @@ -797,7 +784,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* Use workqueue and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); - sbi->ctx.readahead_sync_decompress = true; + sbi->opt.readahead_sync_decompress = true; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -837,7 +824,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, - struct list_head *pagepool) + struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); struct z_erofs_pagevec_ctor ctor; @@ -1037,7 +1024,7 @@ out: } static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct list_head *pagepool) + struct page **pagepool) { z_erofs_next_pcluster_t owned = io->head; @@ -1061,18 +1048,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); - LIST_HEAD(pagepool); + struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); kvfree(bgq); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, - struct list_head *pagepool, + struct page **pagepool, struct address_space *mc, gfp_t gfp) { @@ -1092,15 +1079,6 @@ repeat: if (!page) goto out_allocpage; - /* - * the cached page has not been allocated and - * an placeholder is out there, prepare it now. - */ - if (page == PAGE_UNALLOCATED) { - tocache = true; - goto out_allocpage; - } - /* process the target tagged pointer */ t = tagptr_init(compressed_page_t, page); justfound = tagptr_unfold_tags(t); @@ -1174,7 +1152,7 @@ repeat: out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; } @@ -1258,7 +1236,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, + struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { @@ -1267,8 +1245,9 @@ static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; z_erofs_next_pcluster_t owned_head = f->clt.owned_head; - /* since bio will be NULL, no need to initialize last_index */ + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; + struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; @@ -1280,6 +1259,7 @@ static void z_erofs_submit_queue(struct super_block *sb, q[JQ_SUBMIT]->head = owned_head; do { + struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; @@ -1291,7 +1271,13 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - cur = pcl->obj.index; + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = blknr_to_addr(pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(mdev.m_pa); end = cur + pcl->pclusterpages; /* close the main owned chain at first */ @@ -1307,7 +1293,8 @@ static void z_erofs_submit_queue(struct super_block *sb, if (!page) continue; - if (bio && cur != last_index + 1) { + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); bio = NULL; @@ -1315,9 +1302,10 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); + + bio_set_dev(bio, mdev.m_bdev); + last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; @@ -1356,7 +1344,7 @@ submit_bio_retry: static void z_erofs_runqueue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, bool force_fg) + struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; @@ -1378,18 +1366,87 @@ static void z_erofs_runqueue(struct super_block *sb, z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); } +/* + * Since partial uptodate is still unimplemented for now, we have to use + * approximate readmore strategies as a start. + */ +static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, + struct readahead_control *rac, + erofs_off_t end, + struct page **pagepool, + bool backmost) +{ + struct inode *inode = f->inode; + struct erofs_map_blocks *map = &f->map; + erofs_off_t cur; + int err; + + if (backmost) { + map->m_la = end; + err = z_erofs_map_blocks_iter(inode, map, + EROFS_GET_BLOCKS_READMORE); + if (err) + return; + + /* expend ra for the trailing edge if readahead */ + if (rac) { + loff_t newstart = readahead_pos(rac); + + cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); + readahead_expand(rac, newstart, cur - newstart); + return; + } + end = round_up(end, PAGE_SIZE); + } else { + end = round_up(map->m_la, PAGE_SIZE); + + if (!map->m_llen) + return; + } + + cur = map->m_la + map->m_llen - 1; + while (cur >= end) { + pgoff_t index = cur >> PAGE_SHIFT; + struct page *page; + + page = erofs_grab_cache_page_nowait(inode->i_mapping, index); + if (!page) + goto skip; + + if (PageUptodate(page)) { + unlock_page(page); + put_page(page); + goto skip; + } + + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + put_page(page); +skip: + if (cur < PAGE_SIZE) + break; + cur = (index << PAGE_SHIFT) - 1; + } +} + static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + struct page *pagepool = NULL; int err; - LIST_HEAD(pagepool); trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, + &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); + z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); + (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1401,8 +1458,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) if (f.map.mpage) put_page(f.map.mpage); - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); return err; } @@ -1410,29 +1466,19 @@ static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - unsigned int nr_pages = readahead_count(rac); - bool sync = (sbi->ctx.readahead_sync_decompress && - nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *page, *head = NULL; - LIST_HEAD(pagepool); - - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + struct page *pagepool = NULL, *head = NULL, *page; + unsigned int nr_pages; f.readahead = true; f.headoffset = readahead_pos(rac); + z_erofs_pcluster_readmore(&f, rac, f.headoffset + + readahead_length(rac) - 1, &pagepool, true); + nr_pages = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - /* - * A pure asynchronous readahead is indicated if - * a PG_readahead marked page is hitted at first. - * Let's also do asynchronous decompression for this case. - */ - sync &= !(PageReadahead(page) && !head); - set_page_private(page, (unsigned long)head); head = page; } @@ -1451,16 +1497,15 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - + z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); - + z_erofs_runqueue(inode->i_sb, &f, &pagepool, + sbi->opt.readahead_sync_decompress && + nr_pages <= sbi->opt.max_sync_decompress_pages); if (f.map.mpage) put_page(f.map.mpage); - - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 3a008f1b9f78..4a69515dea75 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -94,13 +94,6 @@ struct z_erofs_decompressqueue { } u; }; -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} - #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) @@ -186,4 +179,3 @@ static inline void z_erofs_onlinepage_endio(struct page *page) #define Z_EROFS_VMAP_GLOBAL_PAGES 2048 #endif - diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 7a6df35fdc91..660489a7fb64 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err; + int err, headnr; erofs_off_t pos; struct page *page; void *kaddr; @@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", - vi->z_algorithmtype[0], vi->nid); + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; goto unmap_done; } @@ -111,7 +113,7 @@ struct z_erofs_maprecorder { unsigned long lcn; /* compression extent information gathered */ - u8 type; + u8 type, headtype; u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedlcs; @@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { - if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -446,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + m->headtype = m->type; map->m_la = (lcn << lclusterbits) | m->clusterofs; break; default: @@ -471,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, int err; DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); - if (!(map->m_flags & EROFS_MAP_ZIPPED) || - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != m->headtype); + + if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1 << lclusterbits; return 0; } - lcn = m->lcn + 1; if (m->compressedlcs) goto out; @@ -499,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, switch (m->type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. @@ -554,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) DBG_BUGON(!m->delta[1] && m->clusterofs != 1 << lclusterbits); } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { /* go on until the next HEAD lcluster */ if (lcn != headlcn) break; @@ -609,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto unmap_out; - map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - if (endoff >= m.clusterofs) - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { + m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; break; } @@ -650,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_llen = end - map->m_la; map->m_pa = blknr_to_addr(m.pblk); - map->m_flags |= EROFS_MAP_MAPPED; err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; - if (flags & EROFS_GET_BLOCKS_FIEMAP) { + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + map->m_algorithmformat = vi->z_algorithmtype[1]; + else + map->m_algorithmformat = vi->z_algorithmtype[0]; + + if ((flags & EROFS_GET_BLOCKS_FIEMAP) || + ((flags & EROFS_GET_BLOCKS_READMORE) && + map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && + map->m_llen >= EROFS_BLKSIZ)) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a67e5f3f576..6361ea1f97bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1572,7 +1572,6 @@ static const struct fscrypt_operations ext4_cryptops = { .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, - .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7b0282724231..f1693d45bb78 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -653,7 +653,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { iput(inode); goto err_out; @@ -705,9 +705,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 9b663eaf4805..7588e4e817b8 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -881,6 +881,25 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } +bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages) +{ + unsigned long pgidx; + int i; + + if (nr_pages - index < cc->cluster_size) + return false; + + pgidx = pvec->pages[index]->index; + + for (i = 1; i < cc->cluster_size; i++) { + if (pvec->pages[index + i]->index != pgidx + i) + return false; + } + + return true; +} + static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 005d6144e5f3..3bd0cc2c1d2e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1470,10 +1470,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; + int bidx = 0; if (!maxblocks) return 0; + map->m_bdev = inode->i_sb->s_bdev; + map->m_multidev_dio = + f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); + map->m_len = 0; map->m_flags = 0; @@ -1496,6 +1501,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + map->m_len = min(map->m_len, + FDEV(bidx).end_blk + 1 - map->m_pblk); + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + } goto out; } @@ -1614,6 +1634,9 @@ next_block: if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; + if (map->m_multidev_dio) + bidx = f2fs_target_device_index(sbi, blkaddr); + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) @@ -1622,10 +1645,15 @@ next_block: map->m_pblk = blkaddr; map->m_len = 1; + + if (map->m_multidev_dio) + map->m_bdev = FDEV(bidx).bdev; } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || flag == F2FS_GET_BLOCK_PRE_DIO) { + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + goto sync_out; ofs++; map->m_len++; } else { @@ -1678,10 +1706,32 @@ skip: sync_out: - /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + /* + * for hardware encryption, but to avoid potential issue + * in future + */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + + f2fs_bug_on(sbi, blk_addr + map->m_len > + FDEV(bidx).end_blk + 1); + } + } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { @@ -1701,7 +1751,7 @@ unlock_out: f2fs_balance_fs(sbi, dn.node_changed); } out: - trace_f2fs_map_blocks(inode, map, err); + trace_f2fs_map_blocks(inode, map, create, flag, err); return err; } @@ -1760,6 +1810,9 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = blks_to_bytes(inode, map.m_len); + + if (map.m_multidev_dio) + bh->b_bdev = map.m_bdev; } return err; } @@ -2994,6 +3047,10 @@ readd: need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; @@ -3012,27 +3069,23 @@ readd: if (unlikely(f2fs_cp_error(sbi))) goto lock_page; - if (f2fs_cluster_is_empty(&cc)) { - void *fsdata = NULL; - struct page *pagep; - int ret2; + if (!f2fs_cluster_is_empty(&cc)) + goto lock_page; - ret2 = f2fs_prepare_compress_overwrite( + ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, page->index, &fsdata); - if (ret2 < 0) { - ret = ret2; - done = 1; - break; - } else if (ret2 && - !f2fs_compress_write_end(inode, - fsdata, page->index, - 1)) { - retry = 1; - break; - } - } else { - goto lock_page; + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + (!f2fs_compress_write_end(inode, + fsdata, page->index, 1) || + !f2fs_all_cluster_page_loaded(&cc, + &pvec, i, nr_pages))) { + retry = 1; + break; } } #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 00d3e7f330fc..64bcb81ec1c6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -55,6 +55,7 @@ enum { FAULT_DISCARD, FAULT_WRITE_IO, FAULT_SLAB_ALLOC, + FAULT_DQUOT_INIT, FAULT_MAX, }; @@ -561,6 +562,9 @@ enum { #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ +/* dirty segments threshold for triggering CP */ +#define DEFAULT_DIRTY_THRESHOLD 4 + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ @@ -617,6 +621,7 @@ struct extent_tree { F2FS_MAP_UNWRITTEN) struct f2fs_map_blocks { + struct block_device *m_bdev; /* for multi-device dio */ block_t m_pblk; block_t m_lblk; unsigned int m_len; @@ -625,6 +630,7 @@ struct f2fs_map_blocks { pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; bool m_may_create; /* indicate it is from write path */ + bool m_multidev_dio; /* indicate it allows multi-device dio */ }; /* for flag in get_data_block */ @@ -1284,8 +1290,10 @@ enum { }; enum { - FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ - FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ + FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; enum { @@ -1728,12 +1736,15 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For multi devices */ int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ - struct mutex umount_mutex; - unsigned int shrinker_run_no; + bool aligned_blksize; /* all devices has the same logical blksize */ /* For write statistics */ u64 sectors_written_start; @@ -1756,6 +1767,9 @@ struct f2fs_sb_info { unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + int max_fragment_chunk; /* max chunk size for block fragmentation mode */ + int max_fragment_hole; /* max hole size for block fragmentation mode */ + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -3363,6 +3377,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); @@ -3492,6 +3507,8 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); @@ -3516,6 +3533,16 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); +#define DEF_FRAGMENT_SIZE 4 +#define MIN_FRAGMENT_SIZE 1 +#define MAX_FRAGMENT_SIZE 512 + +static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || + F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; +} + /* * checkpoint.c */ @@ -4027,6 +4054,8 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, @@ -4301,6 +4330,16 @@ static inline int block_unaligned_IO(struct inode *inode, return align & blocksize_mask; } +static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, + int flag) +{ + if (!f2fs_is_multi_device(sbi)) + return false; + if (flag != F2FS_GET_BLOCK_DIO) + return false; + return sbi->aligned_blksize; +} + static inline bool f2fs_force_buffered_io(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { @@ -4313,7 +4352,9 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return true; if (f2fs_compressed_file(inode)) return true; - if (f2fs_is_multi_device(sbi)) + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) return true; /* * for blkzoned device, fallback direct IO to buffered IO, so diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9c8ef33bd8d3..abe7edc82582 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -786,7 +786,7 @@ int f2fs_truncate(struct inode *inode) return -EIO; } - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; @@ -916,7 +916,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; if (is_quota_modification(inode, attr)) { - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; } @@ -3020,7 +3020,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) } f2fs_put_page(ipage, 1); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 77391e3b7d68..a946ce0ead34 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -257,7 +258,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first in no_heap mode*/ - if (test_opt(sbi, NOHEAP) && + if (f2fs_need_rand_seg(sbi)) + p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2311c76ffc37..928fea695373 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -210,7 +210,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1213f15ffd68..0f8b2df3e1e0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -754,7 +754,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { err = 0; set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ae0838001480..a728a0af9ce0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -74,7 +74,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) goto fail_drop; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto fail_drop; @@ -345,7 +345,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -404,7 +404,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, F2FS_I(old_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -460,7 +460,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -598,10 +598,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) goto fail; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto fail; @@ -675,7 +675,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -746,7 +746,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -803,7 +803,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -841,7 +841,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -965,16 +965,16 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return err; } - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; if (new_inode) { - err = dquot_initialize(new_inode); + err = f2fs_dquot_initialize(new_inode); if (err) goto out; } @@ -1138,11 +1138,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, F2FS_I(new_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index ff14a6e5ac1c..18b98cf0465b 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -138,11 +138,6 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } -static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) -{ - return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; -} - enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 04655511d7f5..6a1b4668d933 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -81,7 +81,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, if (IS_ERR(inode)) return ERR_CAST(inode); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto err_out; @@ -203,7 +203,7 @@ retry: goto out_put; } - err = dquot_initialize(einode); + err = f2fs_dquot_initialize(einode); if (err) { iput(einode); goto out_put; @@ -508,7 +508,7 @@ got_it: if (IS_ERR(inode)) return PTR_ERR(inode); - ret = dquot_initialize(inode); + ret = f2fs_dquot_initialize(inode); if (ret) { iput(inode); return ret; @@ -787,8 +787,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif @@ -816,10 +814,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); - else { - /* restore s_flags to let iput() trash data */ - sbi->sb->s_flags = s_flags; - } + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: fix_curseg_write_pointer = !check_only || list_empty(&inode_list); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d716553bdc02..df9ed75f0b7a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -529,6 +530,25 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } } +static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) +{ + int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; + unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); + unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); + unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int threshold = sbi->blocks_per_seg * factor * + DEFAULT_DIRTY_THRESHOLD; + unsigned int global_threshold = threshold * 3 / 2; + + if (dents >= threshold || qdata >= threshold || + nodes >= threshold || meta >= threshold || + imeta >= threshold) + return true; + return dents + qdata + nodes + meta + imeta > global_threshold; +} + void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -547,8 +567,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) else f2fs_build_free_nids(sbi, false, false); - if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || - excess_prefree_segs(sbi)) + if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || + excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ @@ -2630,6 +2650,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); + if (f2fs_need_rand_seg(sbi)) + return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) @@ -2681,6 +2703,9 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + curseg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2707,12 +2732,22 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi, static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { - if (seg->alloc_type == SSR) + if (seg->alloc_type == SSR) { seg->next_blkoff = __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); - else + } else { seg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk <= 0) { + seg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; + seg->next_blkoff += + prandom_u32() % sbi->max_fragment_hole + 1; + } + } + } } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) @@ -3485,24 +3520,30 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, up_read(&SM_I(sbi)->curseg_lock); } -static void update_device_state(struct f2fs_io_info *fio) +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt) { - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int devidx; - if (!f2fs_is_multi_device(sbi)) return; - devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + while (1) { + unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); + unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; - /* update device state for fsync */ - f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); - /* update device state for checkpoint */ - if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { - spin_lock(&sbi->dev_lock); - f2fs_set_bit(devidx, (char *)&sbi->dirty_device); - spin_unlock(&sbi->dev_lock); + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + if (blkcnt <= blks) + break; + blkcnt -= blks; + blkaddr += blks; } } @@ -3529,7 +3570,7 @@ reallocate: goto reallocate; } - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); if (keep_order) up_read(&fio->sbi->io_order_lock); @@ -3611,6 +3652,9 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) @@ -3618,7 +3662,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) else err = f2fs_submit_page_bio(fio); if (!err) { - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, + fio->new_blkaddr, 1); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89fff258727d..46fde9f3f28e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -314,6 +314,7 @@ struct curseg_info { unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ bool inited; /* indicate inmem log is inited */ }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8795a5a8d4e8..7960ce066c1b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -58,6 +58,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", + [FAULT_DQUOT_INIT] = "dquot initialize", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -817,6 +818,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + } else if (!strcmp(name, "fragment:segment")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; + } else if (!strcmp(name, "fragment:block")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; } else { kfree(name); return -EINVAL; @@ -1896,6 +1901,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) + seq_puts(seq, "fragment:segment"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", @@ -2491,6 +2500,16 @@ retry: return len - towrite; } +int f2fs_dquot_initialize(struct inode *inode) +{ + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) { + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT); + return -ESRCH; + } + + return dquot_initialize(inode); +} + static struct dquot **f2fs_get_dquots(struct inode *inode) { return F2FS_I(inode)->i_dquot; @@ -2875,6 +2894,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else +int f2fs_dquot_initialize(struct inode *inode) +{ + return 0; +} + int f2fs_quota_sync(struct super_block *sb, int type) { return 0; @@ -2976,7 +3000,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, - .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, .get_num_devices = f2fs_get_num_devices, @@ -3523,6 +3546,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; sbi->seq_file_ra_mul = MIN_RA_MUL; + sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; + sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -3747,6 +3772,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; + unsigned int logical_blksize; int i; /* Initialize single device information */ @@ -3767,6 +3793,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (!sbi->devs) return -ENOMEM; + logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); + sbi->aligned_blksize = true; + for (i = 0; i < max_devices; i++) { if (i > 0 && !RDEV(i).path[0]) @@ -3803,6 +3832,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* to release errored devices */ sbi->s_ndevs = i + 1; + if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) + sbi->aligned_blksize = false; + #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a32fe31c33b8..7d289249cd7e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -196,7 +196,7 @@ static ssize_t encoding_show(struct f2fs_attr *a, struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) - return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", + return sysfs_emit(buf, "%s (%d.%d.%d)\n", sb->s_encoding->charset, (sb->s_encoding->version >> 16) & 0xff, (sb->s_encoding->version >> 8) & 0xff, @@ -245,7 +245,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, static ssize_t main_blkaddr_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)MAIN_BLKADDR(sbi)); } @@ -551,6 +551,22 @@ out: return count; } + if (!strcmp(a->attr.name, "max_fragment_chunk")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_chunk = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_hole")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_hole = t; + else + return -EINVAL; + return count; + } + *ui = (unsigned int)t; return count; @@ -781,6 +797,8 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -859,6 +877,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(seq_file_ra_mul), ATTR_LIST(gc_segment_mode), ATTR_LIST(gc_reclaimed_segments), + ATTR_LIST(max_fragment_chunk), + ATTR_LIST(max_fragment_hole), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 03549b5ba204..fe5acdccaae1 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -136,7 +136,7 @@ static int f2fs_begin_enable_verity(struct file *filp) * here and not rely on ->open() doing it. This must be done before * evicting the inline data. */ - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1d2d29dcd41c..e348f33bcb2b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -773,7 +773,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..c57b46a352d8 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = { .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, - .max_namelen = UBIFS_MAX_NLEN, }; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index df8fb8b38541..f3100eb3d9bc 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -118,9 +118,6 @@ struct fscrypt_operations { */ bool (*empty_dir)(struct inode *inode); - /* The filesystem's maximum ciphertext filename length, in bytes */ - unsigned int max_namelen; - /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations diff --git a/include/linux/xz.h b/include/linux/xz.h index 9884c8440188..7285ca5d56e9 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -233,6 +233,112 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s); */ XZ_EXTERN void xz_dec_end(struct xz_dec *s); +/* + * Decompressor for MicroLZMA, an LZMA variant with a very minimal header. + * See xz_dec_microlzma_alloc() below for details. + * + * These functions aren't used or available in preboot code and thus aren't + * marked with XZ_EXTERN. This avoids warnings about static functions that + * are never defined. + */ +/** + * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state + */ +struct xz_dec_microlzma; + +/** + * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder + * @mode XZ_SINGLE or XZ_PREALLOC + * @dict_size LZMA dictionary size. This must be at least 4 KiB and + * at most 3 GiB. + * + * In contrast to xz_dec_init(), this function only allocates the memory + * and remembers the dictionary size. xz_dec_microlzma_reset() must be used + * before calling xz_dec_microlzma_run(). + * + * The amount of allocated memory is a little less than 30 KiB with XZ_SINGLE. + * With XZ_PREALLOC also a dictionary buffer of dict_size bytes is allocated. + * + * On success, xz_dec_microlzma_alloc() returns a pointer to + * struct xz_dec_microlzma. If memory allocation fails or + * dict_size is invalid, NULL is returned. + * + * The compressed format supported by this decoder is a raw LZMA stream + * whose first byte (always 0x00) has been replaced with bitwise-negation + * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is + * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. + * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream + * marker must not be used. The unused values are reserved for future use. + * This MicroLZMA header format was created for use in EROFS but may be used + * by others too. + */ +extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size); + +/** + * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state + * @s Decoder state allocated using xz_dec_microlzma_alloc() + * @comp_size Compressed size of the input stream + * @uncomp_size Uncompressed size of the input stream. A value smaller + * than the real uncompressed size of the input stream can + * be specified if uncomp_size_is_exact is set to false. + * uncomp_size can never be set to a value larger than the + * expected real uncompressed size because it would eventually + * result in XZ_DATA_ERROR. + * @uncomp_size_is_exact This is an int instead of bool to avoid + * requiring stdbool.h. This should normally be set to true. + * When this is set to false, error detection is weaker. + */ +extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, + uint32_t comp_size, uint32_t uncomp_size, + int uncomp_size_is_exact); + +/** + * xz_dec_microlzma_run() - Run the MicroLZMA decoder + * @s Decoder state initialized using xz_dec_microlzma_reset() + * @b: Input and output buffers + * + * This works similarly to xz_dec_run() with a few important differences. + * Only the differences are documented here. + * + * The only possible return values are XZ_OK, XZ_STREAM_END, and + * XZ_DATA_ERROR. This function cannot return XZ_BUF_ERROR: if no progress + * is possible due to lack of input data or output space, this function will + * keep returning XZ_OK. Thus, the calling code must be written so that it + * will eventually provide input and output space matching (or exceeding) + * comp_size and uncomp_size arguments given to xz_dec_microlzma_reset(). + * If the caller cannot do this (for example, if the input file is truncated + * or otherwise corrupt), the caller must detect this error by itself to + * avoid an infinite loop. + * + * If the compressed data seems to be corrupt, XZ_DATA_ERROR is returned. + * This can happen also when incorrect dictionary, uncompressed, or + * compressed sizes have been specified. + * + * With XZ_PREALLOC only: As an extra feature, b->out may be NULL to skip over + * uncompressed data. This way the caller doesn't need to provide a temporary + * output buffer for the bytes that will be ignored. + * + * With XZ_SINGLE only: In contrast to xz_dec_run(), the return value XZ_OK + * is also possible and thus XZ_SINGLE is actually a limited multi-call mode. + * After XZ_OK the bytes decoded so far may be read from the output buffer. + * It is possible to continue decoding but the variables b->out and b->out_pos + * MUST NOT be changed by the caller. Increasing the value of b->out_size is + * allowed to make more output space available; one doesn't need to provide + * space for the whole uncompressed data on the first call. The input buffer + * may be changed normally like with XZ_PREALLOC. This way input data can be + * provided from non-contiguous memory. + */ +extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, + struct xz_buf *b); + +/** + * xz_dec_microlzma_end() - Free the memory allocated for the decoder state + * @s: Decoder state allocated using xz_dec_microlzma_alloc(). + * If s is NULL, this function does nothing. + */ +extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); + /* * Standalone build (userspace build or in-kernel build for boot time use) * needs a CRC32 implementation. For normal in-kernel use, kernel's own diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index db4f2cec8360..16ae7b666810 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -24,7 +24,7 @@ struct erofs_map_blocks; #define show_mflags(flags) __print_flags(flags, "", \ { EROFS_MAP_MAPPED, "M" }, \ { EROFS_MAP_META, "I" }, \ - { EROFS_MAP_ZIPPED, "Z" }) + { EROFS_MAP_ENCODED, "E" }) TRACE_EVENT(erofs_lookup, diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 4cb055af1ec0..f8cb916f3595 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -570,9 +570,10 @@ TRACE_EVENT(f2fs_file_write_iter, ); TRACE_EVENT(f2fs_map_blocks, - TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag, int ret), - TP_ARGS(inode, map, ret), + TP_ARGS(inode, map, create, flag, ret), TP_STRUCT__entry( __field(dev_t, dev) @@ -583,11 +584,14 @@ TRACE_EVENT(f2fs_map_blocks, __field(unsigned int, m_flags) __field(int, m_seg_type) __field(bool, m_may_create) + __field(bool, m_multidev_dio) + __field(int, create) + __field(int, flag) __field(int, ret) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = map->m_bdev->bd_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; @@ -595,12 +599,16 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags = map->m_flags; __entry->m_seg_type = map->m_seg_type; __entry->m_may_create = map->m_may_create; + __entry->m_multidev_dio = map->m_multidev_dio; + __entry->create = create; + __entry->flag = flag; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " - "start blkaddr = 0x%llx, len = 0x%llx, flags = %u," - "seg_type = %d, may_create = %d, err = %d", + "start blkaddr = 0x%llx, len = 0x%llx, flags = %u, " + "seg_type = %d, may_create = %d, multidevice = %d, " + "create = %d, flag = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, @@ -608,6 +616,9 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags, __entry->m_seg_type, __entry->m_may_create, + __entry->m_multidev_dio, + __entry->create, + __entry->flag, __entry->ret) ); diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index f7a3dc13316a..c1b89024f698 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -20,8 +20,8 @@ * * The worst case for in-place decompression is that the beginning of * the file is compressed extremely well, and the rest of the file is - * uncompressible. Thus, we must look for worst-case expansion when the - * compressor is encoding uncompressible data. + * incompressible. Thus, we must look for worst-case expansion when the + * compressor is encoding incompressible data. * * The structure of the .xz file in case of a compressed kernel is as follows. * Sizes (as bytes) of the fields are in parenthesis. @@ -58,7 +58,7 @@ * uncompressed size of the payload is in practice never less than the * payload size itself. The LZMA2 format would allow uncompressed size * to be less than the payload size, but no sane compressor creates such - * files. LZMA2 supports storing uncompressible data in uncompressed form, + * files. LZMA2 supports storing incompressible data in uncompressed form, * so there's never a need to create payloads whose uncompressed size is * smaller than the compressed size. * diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 5cb50245a878..adce22ac18d6 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -39,6 +39,19 @@ config XZ_DEC_SPARC default y select XZ_DEC_BCJ +config XZ_DEC_MICROLZMA + bool "MicroLZMA decoder" + default n + help + MicroLZMA is a header format variant where the first byte + of a raw LZMA stream (without the end of stream marker) has + been replaced with a bitwise-negation of the lc/lp/pb + properties byte. MicroLZMA was created to be used in EROFS + but can be used by other things too where wasting minimal + amount of space for headers is important. + + Unless you know that you need this, say N. + endif config XZ_DEC_BCJ diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index d548cf0e59fe..27ce34520e78 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -248,6 +248,10 @@ struct lzma2_dec { * before the first LZMA chunk. */ bool need_props; + +#ifdef XZ_DEC_MICROLZMA + bool pedantic_microlzma; +#endif }; struct xz_dec_lzma2 { @@ -419,6 +423,12 @@ static void dict_uncompressed(struct dictionary *dict, struct xz_buf *b, } } +#ifdef XZ_DEC_MICROLZMA +# define DICT_FLUSH_SUPPORTS_SKIPPING true +#else +# define DICT_FLUSH_SUPPORTS_SKIPPING false +#endif + /* * Flush pending data from dictionary to b->out. It is assumed that there is * enough space in b->out. This is guaranteed because caller uses dict_limit() @@ -437,9 +447,14 @@ static uint32_t dict_flush(struct dictionary *dict, struct xz_buf *b) * decompression because in multi-call mode dict->buf * has been allocated by us in this file; it's not * provided by the caller like in single-call mode. + * + * With MicroLZMA, b->out can be NULL to skip bytes that + * the caller doesn't need. This cannot be done with XZ + * because it would break BCJ filters. */ - memcpy(b->out + b->out_pos, dict->buf + dict->start, - copy_size); + if (!DICT_FLUSH_SUPPORTS_SKIPPING || b->out != NULL) + memcpy(b->out + b->out_pos, dict->buf + dict->start, + copy_size); } dict->start = dict->pos; @@ -505,7 +520,7 @@ static __always_inline void rc_normalize(struct rc_dec *rc) * functions so that the compiler is supposed to be able to more easily avoid * an extra branch. In this particular version of the LZMA decoder, this * doesn't seem to be a good idea (tested with GCC 3.3.6, 3.4.6, and 4.3.3 - * on x86). Using a non-splitted version results in nicer looking code too. + * on x86). Using a non-split version results in nicer looking code too. * * NOTE: This must return an int. Do not make it return a bool or the speed * of the code generated by GCC 3.x decreases 10-15 %. (GCC 4.3 doesn't care, @@ -791,6 +806,7 @@ static void lzma_reset(struct xz_dec_lzma2 *s) s->lzma.rep1 = 0; s->lzma.rep2 = 0; s->lzma.rep3 = 0; + s->lzma.len = 0; /* * All probabilities are initialized to the same value. This hack @@ -1174,8 +1190,6 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) } } - s->lzma.len = 0; - s->lzma2.sequence = SEQ_CONTROL; s->lzma2.need_dict_reset = true; @@ -1191,3 +1205,140 @@ XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) kfree(s); } + +#ifdef XZ_DEC_MICROLZMA +/* This is a wrapper struct to have a nice struct name in the public API. */ +struct xz_dec_microlzma { + struct xz_dec_lzma2 s; +}; + +enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s_ptr, + struct xz_buf *b) +{ + struct xz_dec_lzma2 *s = &s_ptr->s; + + /* + * sequence is SEQ_PROPERTIES before the first input byte, + * SEQ_LZMA_PREPARE until a total of five bytes have been read, + * and SEQ_LZMA_RUN for the rest of the input stream. + */ + if (s->lzma2.sequence != SEQ_LZMA_RUN) { + if (s->lzma2.sequence == SEQ_PROPERTIES) { + /* One byte is needed for the props. */ + if (b->in_pos >= b->in_size) + return XZ_OK; + + /* + * Don't increment b->in_pos here. The same byte is + * also passed to rc_read_init() which will ignore it. + */ + if (!lzma_props(s, ~b->in[b->in_pos])) + return XZ_DATA_ERROR; + + s->lzma2.sequence = SEQ_LZMA_PREPARE; + } + + /* + * xz_dec_microlzma_reset() doesn't validate the compressed + * size so we do it here. We have to limit the maximum size + * to avoid integer overflows in lzma2_lzma(). 3 GiB is a nice + * round number and much more than users of this code should + * ever need. + */ + if (s->lzma2.compressed < RC_INIT_BYTES + || s->lzma2.compressed > (3U << 30)) + return XZ_DATA_ERROR; + + if (!rc_read_init(&s->rc, b)) + return XZ_OK; + + s->lzma2.compressed -= RC_INIT_BYTES; + s->lzma2.sequence = SEQ_LZMA_RUN; + + dict_reset(&s->dict, b); + } + + /* This is to allow increasing b->out_size between calls. */ + if (DEC_IS_SINGLE(s->dict.mode)) + s->dict.end = b->out_size - b->out_pos; + + while (true) { + dict_limit(&s->dict, min_t(size_t, b->out_size - b->out_pos, + s->lzma2.uncompressed)); + + if (!lzma2_lzma(s, b)) + return XZ_DATA_ERROR; + + s->lzma2.uncompressed -= dict_flush(&s->dict, b); + + if (s->lzma2.uncompressed == 0) { + if (s->lzma2.pedantic_microlzma) { + if (s->lzma2.compressed > 0 || s->lzma.len > 0 + || !rc_is_finished(&s->rc)) + return XZ_DATA_ERROR; + } + + return XZ_STREAM_END; + } + + if (b->out_pos == b->out_size) + return XZ_OK; + + if (b->in_pos == b->in_size + && s->temp.size < s->lzma2.compressed) + return XZ_OK; + } +} + +struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size) +{ + struct xz_dec_microlzma *s; + + /* Restrict dict_size to the same range as in the LZMA2 code. */ + if (dict_size < 4096 || dict_size > (3U << 30)) + return NULL; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return NULL; + + s->s.dict.mode = mode; + s->s.dict.size = dict_size; + + if (DEC_IS_MULTI(mode)) { + s->s.dict.end = dict_size; + + s->s.dict.buf = vmalloc(dict_size); + if (s->s.dict.buf == NULL) { + kfree(s); + return NULL; + } + } + + return s; +} + +void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size, + uint32_t uncomp_size, int uncomp_size_is_exact) +{ + /* + * comp_size is validated in xz_dec_microlzma_run(). + * uncomp_size can safely be anything. + */ + s->s.lzma2.compressed = comp_size; + s->s.lzma2.uncompressed = uncomp_size; + s->s.lzma2.pedantic_microlzma = uncomp_size_is_exact; + + s->s.lzma2.sequence = SEQ_PROPERTIES; + s->s.temp.size = 0; +} + +void xz_dec_microlzma_end(struct xz_dec_microlzma *s) +{ + if (DEC_IS_MULTI(s->s.dict.mode)) + vfree(s->s.dict.buf); + + kfree(s); +} +#endif diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c index 32eb3c03aede..61098c67a413 100644 --- a/lib/xz/xz_dec_syms.c +++ b/lib/xz/xz_dec_syms.c @@ -15,8 +15,15 @@ EXPORT_SYMBOL(xz_dec_reset); EXPORT_SYMBOL(xz_dec_run); EXPORT_SYMBOL(xz_dec_end); +#ifdef CONFIG_XZ_DEC_MICROLZMA +EXPORT_SYMBOL(xz_dec_microlzma_alloc); +EXPORT_SYMBOL(xz_dec_microlzma_reset); +EXPORT_SYMBOL(xz_dec_microlzma_run); +EXPORT_SYMBOL(xz_dec_microlzma_end); +#endif + MODULE_DESCRIPTION("XZ decompressor"); -MODULE_VERSION("1.0"); +MODULE_VERSION("1.1"); MODULE_AUTHOR("Lasse Collin and Igor Pavlov"); /* diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 09360ebb510e..bf1e94ec7873 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -37,6 +37,9 @@ # ifdef CONFIG_XZ_DEC_SPARC # define XZ_DEC_SPARC # endif +# ifdef CONFIG_XZ_DEC_MICROLZMA +# define XZ_DEC_MICROLZMA +# endif # define memeq(a, b, size) (memcmp(a, b, size) == 0) # define memzero(buf, size) memset(buf, 0, size) # endif