Commit Graph

968972 Commits

Author SHA1 Message Date
Chao Yu
fafe5fbb01 f2fs: fix to account missing .skipped_gc_rwsem
There is a missing place we forgot to account .skipped_gc_rwsem, fix it.

Fixes: 6f8d445506 ("f2fs: avoid fi->i_gc_rwsem[WRITE] lock in f2fs_gc")
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-30 17:57:50 -07:00
Chao Yu
b177568d2a f2fs: adjust unlock order for cleanup
This patch adjusts unlock order of .i_mmap_sem and .i_gc_rwsem for
cleanup.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-30 17:57:50 -07:00
Christian Brauner
9c3a9e03e7 mount: fix mounting of detached mounts onto targets that reside on shared mounts
Creating a series of detached mounts, attaching them to the filesystem,
and unmounting them can be used to trigger an integer overflow in
ns->mounts causing the kernel to block any new mounts in count_mounts()
and returning ENOSPC because it falsely assumes that the maximum number
of mounts in the mount namespace has been reached, i.e. it thinks it
can't fit the new mounts into the mount namespace anymore.

Depending on the number of mounts in your system, this can be reproduced
on any kernel that supportes open_tree() and move_mount() by compiling
and running the following program:

  /* SPDX-License-Identifier: LGPL-2.1+ */

  #define _GNU_SOURCE
  #include <errno.h>
  #include <fcntl.h>
  #include <getopt.h>
  #include <limits.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mount.h>
  #include <sys/stat.h>
  #include <sys/syscall.h>
  #include <sys/types.h>
  #include <unistd.h>

  /* open_tree() */
  #ifndef OPEN_TREE_CLONE
  #define OPEN_TREE_CLONE 1
  #endif

  #ifndef OPEN_TREE_CLOEXEC
  #define OPEN_TREE_CLOEXEC O_CLOEXEC
  #endif

  #ifndef __NR_open_tree
          #if defined __alpha__
                  #define __NR_open_tree 538
          #elif defined _MIPS_SIM
                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
                          #define __NR_open_tree 4428
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
                          #define __NR_open_tree 6428
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
                          #define __NR_open_tree 5428
                  #endif
          #elif defined __ia64__
                  #define __NR_open_tree (428 + 1024)
          #else
                  #define __NR_open_tree 428
          #endif
  #endif

  /* move_mount() */
  #ifndef MOVE_MOUNT_F_EMPTY_PATH
  #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
  #endif

  #ifndef __NR_move_mount
          #if defined __alpha__
                  #define __NR_move_mount 539
          #elif defined _MIPS_SIM
                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
                          #define __NR_move_mount 4429
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
                          #define __NR_move_mount 6429
                  #endif
                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
                          #define __NR_move_mount 5429
                  #endif
          #elif defined __ia64__
                  #define __NR_move_mount (428 + 1024)
          #else
                  #define __NR_move_mount 429
          #endif
  #endif

  static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
  {
          return syscall(__NR_open_tree, dfd, filename, flags);
  }

  static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
                                   const char *to_pathname, unsigned int flags)
  {
          return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
  }

  static bool is_shared_mountpoint(const char *path)
  {
          bool shared = false;
          FILE *f = NULL;
          char *line = NULL;
          int i;
          size_t len = 0;

          f = fopen("/proc/self/mountinfo", "re");
          if (!f)
                  return 0;

          while (getline(&line, &len, f) > 0) {
                  char *slider1, *slider2;

                  for (slider1 = line, i = 0; slider1 && i < 4; i++)
                          slider1 = strchr(slider1 + 1, ' ');

                  if (!slider1)
                          continue;

                  slider2 = strchr(slider1 + 1, ' ');
                  if (!slider2)
                          continue;

                  *slider2 = '\0';
                  if (strcmp(slider1 + 1, path) == 0) {
                          /* This is the path. Is it shared? */
                          slider1 = strchr(slider2 + 1, ' ');
                          if (slider1 && strstr(slider1, "shared:")) {
                                  shared = true;
                                  break;
                          }
                  }
          }
          fclose(f);
          free(line);

          return shared;
  }

  static void usage(void)
  {
          const char *text = "mount-new [--recursive] <base-dir>\n";
          fprintf(stderr, "%s", text);
          _exit(EXIT_SUCCESS);
  }

  #define exit_usage(format, ...)                              \
          ({                                                   \
                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
                  usage();                                     \
          })

  #define exit_log(format, ...)                                \
          ({                                                   \
                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
                  exit(EXIT_FAILURE);                          \
          })

  static const struct option longopts[] = {
          {"help",        no_argument,            0,      'a'},
          { NULL,         no_argument,            0,       0 },
  };

  int main(int argc, char *argv[])
  {
          int exit_code = EXIT_SUCCESS, index = 0;
          int dfd, fd_tree, new_argc, ret;
          char *base_dir;
          char *const *new_argv;
          char target[PATH_MAX];

          while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
                  switch (ret) {
                  case 'a':
                          /* fallthrough */
                  default:
                          usage();
                  }
          }

          new_argv = &argv[optind];
          new_argc = argc - optind;
          if (new_argc < 1)
                  exit_usage("Missing base directory\n");
          base_dir = new_argv[0];

          if (*base_dir != '/')
                  exit_log("Please specify an absolute path");

          /* Ensure that target is a shared mountpoint. */
          if (!is_shared_mountpoint(base_dir))
                  exit_log("Please ensure that \"%s\" is a shared mountpoint", base_dir);

          dfd = open(base_dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
          if (dfd < 0)
                  exit_log("%m - Failed to open base directory \"%s\"", base_dir);

          ret = mkdirat(dfd, "detached-move-mount", 0755);
          if (ret < 0)
                  exit_log("%m - Failed to create required temporary directories");

          ret = snprintf(target, sizeof(target), "%s/detached-move-mount", base_dir);
          if (ret < 0 || (size_t)ret >= sizeof(target))
                  exit_log("%m - Failed to assemble target path");

          /*
           * Having a mount table with 10000 mounts is already quite excessive
           * and shoult account even for weird test systems.
           */
          for (size_t i = 0; i < 10000; i++) {
                  fd_tree = sys_open_tree(dfd, "detached-move-mount",
                                          OPEN_TREE_CLONE |
                                          OPEN_TREE_CLOEXEC |
                                          AT_EMPTY_PATH);
                  if (fd_tree < 0) {
                          fprintf(stderr, "%m - Failed to open %d(detached-move-mount)", dfd);
                          exit_code = EXIT_FAILURE;
                          break;
                  }

                  ret = sys_move_mount(fd_tree, "", dfd, "detached-move-mount", MOVE_MOUNT_F_EMPTY_PATH);
                  if (ret < 0) {
                          if (errno == ENOSPC)
                                  fprintf(stderr, "%m - Buggy mount counting");
                          else
                                  fprintf(stderr, "%m - Failed to attach mount to %d(detached-move-mount)", dfd);
                          exit_code = EXIT_FAILURE;
                          break;
                  }
                  close(fd_tree);

                  ret = umount2(target, MNT_DETACH);
                  if (ret < 0) {
                          fprintf(stderr, "%m - Failed to unmount %s", target);
                          exit_code = EXIT_FAILURE;
                          break;
                  }
          }

          (void)unlinkat(dfd, "detached-move-mount", AT_REMOVEDIR);
          close(dfd);

          exit(exit_code);
  }

and wait for the kernel to refuse any new mounts by returning ENOSPC.
How many iterations are needed depends on the number of mounts in your
system. Assuming you have something like 50 mounts on a standard system
it should be almost instantaneous.

The root cause of this is that detached mounts aren't handled correctly
when source and target mount are identical and reside on a shared mount
causing a broken mount tree where the detached source itself is
propagated which propagation prevents for regular bind-mounts and new
mounts. This ultimately leads to a miscalculation of the number of
mounts in the mount namespace.

Detached mounts created via
open_tree(fd, path, OPEN_TREE_CLONE)
are essentially like an unattached new mount, or an unattached
bind-mount. They can then later on be attached to the filesystem via
move_mount() which calls into attach_recursive_mount(). Part of
attaching it to the filesystem is making sure that mounts get correctly
propagated in case the destination mountpoint is MS_SHARED, i.e. is a
shared mountpoint. This is done by calling into propagate_mnt() which
walks the list of peers calling propagate_one() on each mount in this
list making sure it receives the propagation event.
The propagate_one() functions thereby skips both new mounts and bind
mounts to not propagate them "into themselves". Both are identified by
checking whether the mount is already attached to any mount namespace in
mnt->mnt_ns. The is what the IS_MNT_NEW() helper is responsible for.

However, detached mounts have an anonymous mount namespace attached to
them stashed in mnt->mnt_ns which means that IS_MNT_NEW() doesn't
realize they need to be skipped causing the mount to propagate "into
itself" breaking the mount table and causing a disconnect between the
number of mounts recorded as being beneath or reachable from the target
mountpoint and the number of mounts actually recorded/counted in
ns->mounts ultimately causing an overflow which in turn prevents any new
mounts via the ENOSPC issue.

So teach propagation to handle detached mounts by making it aware of
them. I've been tracking this issue down for the last couple of days and
then verifying that the fix is correct by
unmounting everything in my current mount table leaving only /proc and
/sys mounted and running the reproducer above overnight verifying the
number of mounts counted in ns->mounts. With this fix the counts are
correct and the ENOSPC issue can't be reproduced.

This change will only have an effect on mounts created with the new
mount API since detached mounts cannot be created with the old mount API
so regressions are extremely unlikely.

Link: https://lore.kernel.org/r/20210306101010.243666-1-christian.brauner@ubuntu.com
Fixes: 2db154b3ea ("vfs: syscall: Add move_mount(2) to move mounts around")
Cc: David Howells <dhowells@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: <stable@vger.kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
2021-08-30 17:57:50 -07:00
Fengnan Chang
07290879f7 f2fs: Don't create discard thread when device doesn't support realtime discard
Don't create discard thread when device doesn't support realtime discard
or user specifies nodiscard mount option.

Signed-off-by: Fengnan Chang <changfengnan@vivo.com>
Signed-off-by: Yangtao Li <frank.li@vivo.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-30 17:57:49 -07:00
Chao Yu
5145b2306f f2fs: rebuild nat_bits during umount
If all free_nat_bitmap are available, we can rebuild nat_bits from
free_nat_bitmap entirely during umount, let's make another chance
to reenable nat_bits for image.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:59:00 -07:00
Daeho Jeong
d95bf22a4d f2fs: introduce periodic iostat io latency traces
Whenever we notice some sluggish issues on our machines, we are always
curious about how well all types of I/O in the f2fs filesystem are
handled. But, it's hard to get this kind of real data. First of all,
we need to reproduce the issue while turning on the profiling tool like
blktrace, but the issue doesn't happen again easily. Second, with the
intervention of any tools, the overall timing of the issue will be
slightly changed and it sometimes makes us hard to figure it out.

So, I added the feature printing out IO latency statistics tracepoint
events, which are minimal things to understand filesystem's I/O related
behaviors, into F2FS_IOSTAT kernel config. With "iostat_enable" sysfs
node on, we can get this statistics info in a periodic way and it
would cause the least overhead.

[samples]
 f2fs_ckpt-254:1-507     [003] ....  2842.439683: f2fs_iostat_latency:
dev = (254,11), iotype [peak lat.(ms)/avg lat.(ms)/count],
rd_data [136/1/801], rd_node [136/1/1704], rd_meta [4/2/4],
wr_sync_data [164/16/3331], wr_sync_node [152/3/648],
wr_sync_meta [160/2/4243], wr_async_data [24/13/15],
wr_async_node [0/0/0], wr_async_meta [0/0/0]

 f2fs_ckpt-254:1-507     [002] ....  2845.450514: f2fs_iostat_latency:
dev = (254,11), iotype [peak lat.(ms)/avg lat.(ms)/count],
rd_data [60/3/456], rd_node [60/3/1258], rd_meta [0/0/1],
wr_sync_data [120/12/2285], wr_sync_node [88/5/428],
wr_sync_meta [52/6/2990], wr_async_data [4/1/3],
wr_async_node [0/0/0], wr_async_meta [0/0/0]

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:56 -07:00
Daeho Jeong
d20508acee f2fs: separate out iostat feature
Added F2FS_IOSTAT config option to support getting IO statistics through
sysfs and printing out periodic IO statistics tracepoint events and
moved I/O statistics related codes into separate files for better
maintenance.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
[Jaegeuk Kim: set default=y]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:53 -07:00
Chao Yu
32935b532d f2fs: compress: do sanity check on cluster
This patch adds f2fs_sanity_check_cluster() to support doing
sanity check on cluster of compressed file, it will be triggered
from below two paths:

- __f2fs_cluster_blocks()
- f2fs_map_blocks(F2FS_GET_BLOCK_FIEMAP)

And it can detect below three kind of cluster insanity status.

C: COMPRESS_ADDR
N: NULL_ADDR or NEW_ADDR
V: valid blkaddr
*: any value

1. [*|C|*|*]
2. [C|*|C|*]
3. [C|N|N|V]

Signed-off-by: Chao Yu <chao@kernel.org>
[Nathan Chancellor: fix missing inline warning]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:50 -07:00
Yangtao Li
fcf8c77c97 f2fs: fix description about main_blkaddr node
Don't leave a blank line, to keep the style consistent
with other node descriptions.

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:47 -07:00
Yangtao Li
35f8bc157f f2fs: convert S_IRUGO to 0444
To fix:
WARNING: Symbolic permissions 'S_IRUGO' are not preferred. Consider using octal permissions '0444'.

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:43 -07:00
Chao Yu
717d1e903c f2fs: fix to keep compatibility of fault injection interface
The value of FAULT_* macros and its description in f2fs.rst became
inconsistent, fix this to keep compatibility of fault injection
interface.

Fixes: 67883ade7a ("f2fs: remove FAULT_ALLOC_BIO")
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:38 -07:00
Chao Yu
4ab362d8ca f2fs: support fault injection for f2fs_kmem_cache_alloc()
This patch supports to inject fault into f2fs_kmem_cache_alloc().

Usage:
a) echo 32768 > /sys/fs/f2fs/<dev>/inject_type or
b) mount -o fault_type=32768 <dev> <mountpoint>

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:38 -07:00
Fengnan Chang
9d05ffd991 f2fs: compress: allow write compress released file after truncate to zero
For compressed file, after release compress blocks, don't allow write
direct, but we should allow write direct after truncate to zero.

Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Fengnan Chang <changfengnan@vivo.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-23 13:58:37 -07:00
Yangtao Li
7a11fe8af9 f2fs: correct comment in segment.h
s/two/three

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-12 14:21:57 -07:00
Yangtao Li
b771a85432 f2fs: improve sbi status info in debugfs/f2fs/status
Do not use numbers but strings to improve readability when flag is set.

Signed-off-by: Yangtao Li <frank.li@vivo.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-12 14:21:53 -07:00
Fengnan Chang
60a81e6cf3 f2fs: compress: avoid duplicate counting of valid blocks when read compressed file
Since cluster is basic unit of compression, one cluster is compressed or
not, so we can calculate valid blocks only for first page in cluster,
the other pages just skip.

Signed-off-by: Fengnan Chang <changfengnan@vivo.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-12 14:21:50 -07:00
Chao Yu
24d236c4fa f2fs: fix to do sanity check for sb/cp fields correctly
This patch fixes below problems of sb/cp sanity check:
- in sanity_check_raw_superi(), it missed to consider log header
blocks while cp_payload check.
- in f2fs_sanity_check_ckpt(), it missed to check nat_bits_blocks.

Cc: <stable@kernel.org>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-12 14:21:46 -07:00
Chao Yu
e2a7d2733b f2fs: avoid unneeded memory allocation in __add_ino_entry()
__add_ino_entry() will allocate slab cache even if we have already
cached ino entry in radix tree, e.g. for case of multiple devices.

Let's check radix tree first under protection of rcu lock to see
whether we need to do slab allocation, it will mitigate memory
pressure from "f2fs_ino_entry" slab cache.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-12 14:21:42 -07:00
Chao Yu
627371ed31 f2fs: extent cache: support unaligned extent
Compressed inode may suffer read performance issue due to it can not
use extent cache, so I propose to add this unaligned extent support
to improve it.

Currently, it only works in readonly format f2fs image.

Unaligned extent: in one compressed cluster, physical block number
will be less than logical block number, so we add an extra physical
block length in extent info in order to indicate such extent status.

The idea is if one whole cluster blocks are contiguous physically,
once its mapping info was readed at first time, we will cache an
unaligned (or aligned) extent info entry in extent cache, it expects
that the mapping info will be hitted when rereading cluster.

Merge policy:
- Aligned extents can be merged.
- Aligned extent and unaligned extent can not be merged.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-05 12:02:51 -07:00
Tiezhu Yang
62d321c3b8 f2fs: Kconfig: clean up config options about compression
In fs/f2fs/Kconfig, F2FS_FS_LZ4HC depends on F2FS_FS_LZ4 and F2FS_FS_LZ4
depends on F2FS_FS_COMPRESSION, so no need to make F2FS_FS_LZ4HC depends
on F2FS_FS_COMPRESSION explicitly, remove the redudant "depends on", do
the similar thing for F2FS_FS_LZORLE.

At the same time, it is better to move F2FS_FS_LZORLE next to F2FS_FS_LZO,
it looks like a little more clear when make menuconfig, the location of
"LZO-RLE compression support" is under "LZO compression support" instead
of "F2FS compression feature".

Without this patch:

F2FS compression feature
  LZO compression support
  LZ4 compression support
    LZ4HC compression support
  ZSTD compression support
  LZO-RLE compression support

With this patch:

F2FS compression feature
  LZO compression support
    LZO-RLE compression support
  LZ4 compression support
    LZ4HC compression support
  ZSTD compression support

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-05 12:02:47 -07:00
Yangtao Li
7b9b92ff1b f2fs: reduce the scope of setting fsck tag when de->name_len is zero
I recently found a case where de->name_len is 0 in f2fs_fill_dentries()
easily reproduced, and finally set the fsck flag.

Thread A			Thread B
- f2fs_readdir
 - f2fs_read_inline_dir
  - ctx->pos = d.max
				- f2fs_add_dentry
				 - f2fs_add_inline_entry
				  - do_convert_inline_dir
				 - f2fs_add_regular_entry
- f2fs_readdir
 - f2fs_fill_dentries
  - set_sbi_flag(sbi, SBI_NEED_FSCK)

Process A opens the folder, and has been reading without closing it.
During this period, Process B created a file under the folder (occupying
multiple f2fs_dir_entry, exceeding the d.max of the inline dir). After
creation, process A uses the d.max of inline dir to read it again, and
it will read that de->name_len is 0.

And Chao pointed out that w/o inline conversion, the race condition still
can happen as below:

dir_entry1: A
dir_entry2: B
dir_entry3: C
free slot: _
ctx->pos: ^

Thread A is traversing directory,
ctx-pos moves to below position after readdir() by thread A:
AAAABBBB___
        ^

Then thread B delete dir_entry2, and create dir_entry3.

Thread A calls readdir() to lookup dirents starting from middle
of new dirent slots as below:
AAAACCCCCC_
        ^
In these scenarios, the file system is not damaged, and it's hard to
avoid it. But we can bypass tagging FSCK flag if:
a) bit_pos (:= ctx->pos % d->max) is non-zero and
b) before bit_pos moves to first valid dir_entry.

Fixes: ddf06b753a ("f2fs: fix to trigger fsck if dirent.name_len is zero")
Signed-off-by: Yangtao Li <frank.li@vivo.com>
[Chao: clean up description]
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:15:37 -07:00
Chao Yu
3f4f0468db f2fs: fix to stop filesystem update once CP failed
During f2fs_write_checkpoint(), once we failed in
f2fs_flush_nat_entries() or do_checkpoint(), metadata of filesystem
such as prefree bitmap, nat/sit version bitmap won't be recovered,
it may cause f2fs image to be inconsistent, let's just set CP error
flag to avoid further updates until we figure out a scheme to rollback
all metadatas in such condition.

Reported-by: Yangtao Li <frank.li@vivo.com>
Signed-off-by: Yangtao Li <frank.li@vivo.com>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:15:33 -07:00
Daeho Jeong
94400420e1 f2fs: add sysfs node to control ra_pages for fadvise seq file
fadvise() allows the user to expand the readahead window to double with
POSIX_FADV_SEQUENTIAL, now. But, in some use cases, it is not that
sufficient and we need to meet the need in a restricted way. We can
control the multiplier value of bdi device readahead between 2 (default)
and 256 for POSIX_FADV_SEQUENTIAL advise option.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:15:29 -07:00
Chao Yu
b4333197db f2fs: introduce discard_unit mount option
As James Z reported in bugzilla:

https://bugzilla.kernel.org/show_bug.cgi?id=213877

[1.] One-line summary of the problem:
Mount multiple SMR block devices exceed certain number cause system non-response

[2.] Full description of the problem/report:
Created some F2FS on SMR devices (mkfs.f2fs -m), then mounted in sequence. Each device is the same Model: HGST HSH721414AL (Size 14TB).
Empirically, found that when the amount of SMR device * 1.5Gb > System RAM, the system ran out of memory and hung. No dmesg output. For example, 24 SMR Disk need 24*1.5GB = 36GB. A system with 32G RAM can only mount 21 devices, the 22nd device will be a reproducible cause of system hang.
The number of SMR devices with other FS mounted on this system does not interfere with the result above.

[3.] Keywords (i.e., modules, networking, kernel):
F2FS, SMR, Memory

[4.] Kernel information
[4.1.] Kernel version (uname -a):
Linux 5.13.4-200.fc34.x86_64 #1 SMP Tue Jul 20 20:27:29 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux

[4.2.] Kernel .config file:
Default Fedora 34 with f2fs-tools-1.14.0-2.fc34.x86_64

[5.] Most recent kernel version which did not have the bug:
None

[6.] Output of Oops.. message (if applicable) with symbolic information
     resolved (see Documentation/admin-guide/oops-tracing.rst)
None

[7.] A small shell script or example program which triggers the
     problem (if possible)
mount /dev/sdX /mnt/0X

[8.] Memory consumption

With 24 * 14T SMR Block device with F2FS
free -g
              total        used        free      shared  buff/cache   available
Mem:             46          36           0           0          10          10
Swap:             0           0           0

With 3 * 14T SMR Block device with F2FS
free -g
               total        used        free      shared  buff/cache   available
Mem:               7           5           0           0           1           1
Swap:              7           0           7

The root cause is, there are three bitmaps:
- cur_valid_map
- ckpt_valid_map
- discard_map
and each of them will cost ~500MB memory, {cur, ckpt}_valid_map are
necessary, but discard_map is optional, since this bitmap will only be
useful in mountpoint that small discard is enabled.

For a blkzoned device such as SMR or ZNS devices, f2fs will only issue
discard for a section(zone) when all blocks of that section are invalid,
so, for such device, we don't need small discard functionality at all.

This patch introduces a new mountoption "discard_unit=block|segment|
section" to support issuing discard with different basic unit which is
aligned to block, segment or section, so that user can specify
"discard_unit=segment" or "discard_unit=section" to disable small
discard functionality.

Note that this mount option can not be changed by remount() due to
related metadata need to be initialized during mount().

In order to save memory, let's use "discard_unit=section" for blkzoned
device by default.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:14:21 -07:00
Laibin Qiu
bfb4379b21 f2fs: fix min_seq_blocks can not make sense in some scenes.
F2FS have dirty page count control for batched sequential
write in writepages, and get the value of min_seq_blocks by
blocks_per_seg * segs_per_sec(segs_per_sec defaults to 1).
But in some scenes we set a lager section size, Min_seq_blocks
will become too large to achieve the expected effect(eg. 4thread
sequential write, the number of merge requests will be reduced).

Signed-off-by: Laibin Qiu <qiulaibin@huawei.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:14:17 -07:00
Chao Yu
673d77626c f2fs: fix to force keeping write barrier for strict fsync mode
[1] https://www.mail-archive.com/linux-f2fs-devel@lists.sourceforge.net/msg15126.html

As [1] reported, if lower device doesn't support write barrier, in below
case:

- write page #0; persist
- overwrite page #0
- fsync
 - write data page #0 OPU into device's cache
 - write inode page into device's cache
 - issue flush

If SPO is triggered during flush command, inode page can be persisted
before data page #0, so that after recovery, inode page can be recovered
with new physical block address of data page #0, however there may
contains dummy data in new physical block address.

Then what user will see is: after overwrite & fsync + SPO, old data in
file was corrupted, if any user do care about such case, we can suggest
user to use STRICT fsync mode, in this mode, we will force to use atomic
write sematics to keep write order in between data/node and last node,
so that it avoids potential data corruption during fsync().

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:14:14 -07:00
Chao Yu
8b68b2810a f2fs: fix wrong checkpoint_changed value in f2fs_remount()
In f2fs_remount(), return value of test_opt() is an unsigned int type
variable, however when we compare it to a bool type variable, it cause
wrong result, fix it.

Fixes: 4354994f09 ("f2fs: checkpoint disabling")
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:14:10 -07:00
Jaegeuk Kim
125d10640c f2fs: show sbi status in debugfs/f2fs/status
We need to get sbi->s_flag to understand the current f2fs status as well.
One example is SBI_NEED_FSCK.

Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:14:07 -07:00
Daeho Jeong
fce7eb6bbb f2fs: turn back remapped address in compressed page endio
Turned back the remmaped sector address to the address in the partition,
when ending io, for compress cache to work properly.

Fixes: 6ce19aff0b ("f2fs: compress: add compress_inode to cache
compressed blocks")
Signed-off-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Youngjin Gil <youngjin.gil@samsung.com>
Signed-off-by: Hyeong Jun Kim <hj514.kim@samsung.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:14:04 -07:00
Daeho Jeong
eb7b3d4bcc f2fs: change fiemap way in printing compression chunk
When we print out a discontinuous compression chunk, it shows like a
continuous chunk now. To show it more correctly, I've changed the way of
printing fiemap info like below. Plus, eliminated NEW_ADDR(-1) in fiemap
info, since it is not in fiemap user api manual.

Let's assume 16KB compression cluster.

<before>
   Logical          Physical         Length           Flags
0:  0000000000000000 00000002c091f000 0000000000004000 1008
1:  0000000000004000 00000002c0920000 0000000000004000 1008
  ...
9:  0000000000034000 0000000f8c623000 0000000000004000 1008
10: 0000000000038000 000000101a6eb000 0000000000004000 1008

<after>
0:  0000000000000000 00000002c091f000 0000000000004000 1008
1:  0000000000004000 00000002c0920000 0000000000004000 1008
  ...
9:  0000000000034000 0000000f8c623000 0000000000001000 1008
10: 0000000000035000 000000101a6ea000 0000000000003000 1008
11: 0000000000038000 000000101a6eb000 0000000000002000 1008
12: 000000000003a000 00000002c3544000 0000000000002000 1008

Flags
0x1000 => FIEMAP_EXTENT_MERGED
0x0008 => FIEMAP_EXTENT_ENCODED

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Tested-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:14:01 -07:00
Jaegeuk Kim
e262b6b313 f2fs: do not submit NEW_ADDR to read node block
After the below patch, give cp is errored, we drop dirty node pages. This
can give NEW_ADDR to read node pages. Don't do WARN_ON() which gives
generic/475 failure.

Fixes: 28607bf3aa ("f2fs: drop dirty node pages when cp is in error status")
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:13:57 -07:00
Fengnan Chang
8d90322674 f2fs: compress: remove unneeded read when rewrite whole cluster
when we overwrite the whole page in cluster, we don't need read original
data before write, because after write_end(), writepages() can help to
load left data in that cluster.

Signed-off-by: Fengnan Chang <changfengnan@vivo.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-04 16:13:54 -07:00
Jaegeuk Kim
27d5563ccf f2fs: don't sleep while grabing nat_tree_lock
This tries to fix priority inversion in the below condition resulting in
long checkpoint delay.

f2fs_get_node_info()
 - nat_tree_lock
  -> sleep to grab journal_rwsem by contention

                                     checkpoint
                                     - waiting for nat_tree_lock

In order to let checkpoint go, let's release nat_tree_lock, if there's a
journal_rwsem contention.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-23 16:03:02 -07:00
Eric Biggers
cef541ebfd f2fs: remove allow_outplace_dio()
We can just check f2fs_lfs_mode() directly.  The block_unaligned_IO()
check is redundant because in LFS mode, f2fs doesn't do direct I/O
writes that aren't block-aligned (due to f2fs_force_buffered_io()
returning true in this case, triggering the fallback to buffered I/O).

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-22 14:59:40 -07:00
Eric Biggers
c70a6087b6 f2fs: make f2fs_write_failed() take struct inode
Make f2fs_write_failed() take a 'struct inode' directly rather than a
'struct address_space', as this simplifies it slightly.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-22 14:59:38 -07:00
Chao Yu
f516b460b8 f2fs: quota: fix potential deadlock
xfstest generic/587 reports a deadlock issue as below:

======================================================
WARNING: possible circular locking dependency detected
5.14.0-rc1 #69 Not tainted
------------------------------------------------------
repquota/8606 is trying to acquire lock:
ffff888022ac9320 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}, at: f2fs_quota_sync+0x207/0x300 [f2fs]

but task is already holding lock:
ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&sbi->quota_sem){.+.+}-{3:3}:
       __lock_acquire+0x648/0x10b0
       lock_acquire+0x128/0x470
       down_read+0x3b/0x2a0
       f2fs_quota_sync+0x59/0x300 [f2fs]
       f2fs_quota_on+0x48/0x100 [f2fs]
       do_quotactl+0x5e3/0xb30
       __x64_sys_quotactl+0x23a/0x4e0
       do_syscall_64+0x3b/0x90
       entry_SYSCALL_64_after_hwframe+0x44/0xae

-> #1 (&sbi->cp_rwsem){++++}-{3:3}:
       __lock_acquire+0x648/0x10b0
       lock_acquire+0x128/0x470
       down_read+0x3b/0x2a0
       f2fs_unlink+0x353/0x670 [f2fs]
       vfs_unlink+0x1c7/0x380
       do_unlinkat+0x413/0x4b0
       __x64_sys_unlinkat+0x50/0xb0
       do_syscall_64+0x3b/0x90
       entry_SYSCALL_64_after_hwframe+0x44/0xae

-> #0 (&sb->s_type->i_mutex_key#18){+.+.}-{3:3}:
       check_prev_add+0xdc/0xb30
       validate_chain+0xa67/0xb20
       __lock_acquire+0x648/0x10b0
       lock_acquire+0x128/0x470
       down_write+0x39/0xc0
       f2fs_quota_sync+0x207/0x300 [f2fs]
       do_quotactl+0xaff/0xb30
       __x64_sys_quotactl+0x23a/0x4e0
       do_syscall_64+0x3b/0x90
       entry_SYSCALL_64_after_hwframe+0x44/0xae

other info that might help us debug this:

Chain exists of:
  &sb->s_type->i_mutex_key#18 --> &sbi->cp_rwsem --> &sbi->quota_sem

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&sbi->quota_sem);
                               lock(&sbi->cp_rwsem);
                               lock(&sbi->quota_sem);
  lock(&sb->s_type->i_mutex_key#18);

 *** DEADLOCK ***

3 locks held by repquota/8606:
 #0: ffff88801efac0e0 (&type->s_umount_key#53){++++}-{3:3}, at: user_get_super+0xd9/0x190
 #1: ffff8880084bc380 (&sbi->cp_rwsem){++++}-{3:3}, at: f2fs_quota_sync+0x3e/0x300 [f2fs]
 #2: ffff8880084bcde8 (&sbi->quota_sem){.+.+}-{3:3}, at: f2fs_quota_sync+0x59/0x300 [f2fs]

stack backtrace:
CPU: 6 PID: 8606 Comm: repquota Not tainted 5.14.0-rc1 #69
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
Call Trace:
 dump_stack_lvl+0xce/0x134
 dump_stack+0x17/0x20
 print_circular_bug.isra.0.cold+0x239/0x253
 check_noncircular+0x1be/0x1f0
 check_prev_add+0xdc/0xb30
 validate_chain+0xa67/0xb20
 __lock_acquire+0x648/0x10b0
 lock_acquire+0x128/0x470
 down_write+0x39/0xc0
 f2fs_quota_sync+0x207/0x300 [f2fs]
 do_quotactl+0xaff/0xb30
 __x64_sys_quotactl+0x23a/0x4e0
 do_syscall_64+0x3b/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f883b0b4efe

The root cause is ABBA deadlock of inode lock and cp_rwsem,
reorder locks in f2fs_quota_sync() as below to fix this issue:
- lock inode
- lock cp_rwsem
- lock quota_sem

Fixes: db6ec53b7e ("f2fs: add a rw_sem to cover quota flag changes")
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-19 18:05:00 -07:00
Jaegeuk Kim
4cb1eb81bc f2fs: let's keep writing IOs on SBI_NEED_FSCK
SBI_NEED_FSCK is an indicator that fsck.f2fs needs to be triggered, so it
is not fully critical to stop any IO writes. So, let's allow to write data
instead of reporting EIO forever given SBI_NEED_FSCK, but do keep OPU.

Fixes: 9557727876 ("f2fs: drop inplace IO if fs status is abnormal")
Cc: <stable@kernel.org> # v5.13+
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-19 18:04:59 -07:00
Jia Yang
ca329be902 f2fs: Revert "f2fs: Fix indefinite loop in f2fs_gc() v1"
This reverts commit 957fa47823.

The patch "f2fs: Fix indefinite loop in f2fs_gc()" v1 and v4 are all
merged. Patch v4 is test info for patch v1. Patch v1 doesn't work and
may cause that sbi->cur_victim_sec can't be resetted to NULL_SEGNO,
which makes SSR unable to get segment of sbi->cur_victim_sec.
So it should be reverted.

The mails record:
[1] https://lore.kernel.org/linux-f2fs-devel/7288dcd4-b168-7656-d1af-7e2cafa4f720@huawei.com/T/
[2] https://lore.kernel.org/linux-f2fs-devel/20190809153653.GD93481@jaegeuk-macbookpro.roam.corp.google.com/T/

Signed-off-by: Jia Yang <jiayang5@huawei.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-19 18:04:58 -07:00
Wang Xiaojun
3b12f276ea f2fs: avoid to create an empty string as the extension_list
When creating a file, we need to set the temperature based on
extension_list. If the empty string is a valid extension_list,
the is_extension_exist will always returns true,
which affects the separation of hot and cold.

Signed-off-by: Wang Xiaojun <wangxiaojun11@huawei.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-13 16:52:08 -07:00
Chao Yu
5fd8bddaf0 f2fs: compress: fix to set zstd compress level correctly
As 5kft reported in [1]:

set_compress_context() should set compress level into .i_compress_flag
for zstd as well as lz4hc, otherwise, zstd compressor will still use
default zstd compress level during compression, fix it.

[1] https://lore.kernel.org/linux-f2fs-devel/8e29f52b-6b0d-45ec-9520-e63eb254287a@www.fastmail.com/T/#u

Fixes: 3fde13f817 ("f2fs: compress: support compress level")
Reported-by: 5kft <5kft@5kft.org>
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-13 16:52:07 -07:00
Daeho Jeong
4479c16890 f2fs: add sysfs nodes to get GC info for each GC mode
Added gc_reclaimed_segments and gc_segment_mode sysfs nodes.
1) "gc_reclaimed_segments" shows how many segments have been
reclaimed by GC during a specific GC mode.
2) "gc_segment_mode" is used to control for which gc mode
the "gc_reclaimed_segments" node shows.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-13 16:52:07 -07:00
Jaegeuk Kim
bc961aee9e f2fs: drop dirty node pages when cp is in error status
Otherwise, writeback is going to fall in a loop to flush dirty inode forever
before getting SBI_CLOSING.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-06 22:06:23 -07:00
Jaegeuk Kim
30c2a11b57 f2fs: initialize page->private when using for our internal use
We need to guarantee it's initially zero. Otherwise, it'll hurt entire flag
operations.

Fixes: b763f3bedc ("f2fs: restructure f2fs page.private layout")
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-04 22:20:02 -07:00
Fengnan Chang
0155e967e0 f2fs: compress: add nocompress extensions support
When we create a directory with enable compression, all file write into
directory will try to compress.But sometimes we may know, new file
cannot meet compression ratio requirements.
We need a nocompress extension to skip those files to avoid unnecessary
compress page test.

After add nocompress_extension, the priority should be:
dir_flag < comp_extention,nocompress_extension < comp_file_flag,
no_comp_file_flag.

Priority in between FS_COMPR_FL, FS_NOCOMP_FS, extensions:
   * compress_extension=so; nocompress_extension=zip; chattr +c dir;
     touch dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so
     and baz.txt should be compresse, bar.zip should be non-compressed.
     chattr +c dir/bar.zip can enable compress on bar.zip.
   * compress_extension=so; nocompress_extension=zip; chattr -c dir;
     touch dir/foo.so; touch dir/bar.zip; touch dir/baz.txt; then foo.so
     should be compresse, bar.zip and baz.txt should be non-compressed.
     chattr+c dir/bar.zip; chattr+c dir/baz.txt; can enable compress on
     bar.zip and baz.txt.

Signed-off-by: Fengnan Chang <changfengnan@vivo.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-01 10:25:51 -07:00
Jaegeuk Kim
bb51a33182 Revert "f2fs: avoid attaching SB_ACTIVE flag during mount/remount"
This reverts commit 42bbf0bcc2.
2021-06-23 01:42:05 -07:00
Jaegeuk Kim
c81ac64da1 f2fs: remove false alarm on iget failure during GC
This patch removes setting SBI_NEED_FSCK when GC gets an error on f2fs_iget,
since f2fs_iget can give ENOMEM and others by race condition.
If we set this critical fsck flag, we'll get EIO during fsync via the below
code path.

In f2fs_inplace_write_data(),

	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) {
		err = -EIO;
		goto drop_bio;
	}

Fixes: 9557727876 ("f2fs: drop inplace IO if fs status is abnormal")
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-06-23 01:41:57 -07:00
Daeho Jeong
cdeff03989 f2fs: enable extent cache for compression files in read-only
Let's allow extent cache for RO partition.

Signed-off-by: Daeho Jeong <daehojeong@google.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-06-21 07:24:26 -07:00
Chao Yu
15a475975e f2fs: fix to avoid adding tab before doc section
Otherwise whole section after tab will be invisible in compiled
html format document.

Cc: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Fixes: 89272ca110 ("docs: filesystems: convert f2fs.txt to ReST")
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-06-21 07:24:25 -07:00
Chao Yu
44e0be85eb f2fs: introduce f2fs_casefolded_name slab cache
Add a slab cache: "f2fs_casefolded_name" for memory allocation
of casefold name.

Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-06-21 07:24:24 -07:00
Chao Yu
eaef955b91 f2fs: swap: support migrating swapfile in aligned write mode
This patch supports to migrate swapfile in aligned write mode during
swapon in order to keep swapfile being aligned to section as much as
possible, then pinned swapfile will locates fully filled section which
may not affected by GC.

However, for the case that swapfile's size is not aligned to section
size, it will still leave last extent in file's tail as unaligned due
to its size is smaller than section size, like case #2.

case #1
xfs_io -f /mnt/f2fs/file -c "pwrite 0 4M" -c "fsync"

Before swapon:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
   0: [0..3047]:       1123352..1126399  3048 0x1000
   1: [3048..7143]:    237568..241663    4096 0x1000
   2: [7144..8191]:    245760..246807    1048 0x1001
After swapon:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
   0: [0..8191]:       249856..258047    8192 0x1001
Kmsg:
F2FS-fs (zram0): Swapfile (2) is not align to section:
1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(2097152 * n)

case #2
xfs_io -f /mnt/f2fs/file -c "pwrite 0 3M" -c "fsync"

Before swapon:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
   0: [0..3047]:       246808..249855    3048 0x1000
   1: [3048..6143]:    237568..240663    3096 0x1001
After swapon:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
   0: [0..4095]:       258048..262143    4096 0x1000
   1: [4096..6143]:    238616..240663    2048 0x1001
Kmsg:
F2FS-fs (zram0): Swapfile: last extent is not aligned to section
F2FS-fs (zram0): Swapfile (2) is not align to section:
1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(2097152 * n)

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-06-07 10:21:38 -07:00