mirror of
https://github.com/hardkernel/linux.git
synced 2026-06-06 10:58:48 +09:00
Merge 5.15.37 into android14-5.15
Changes in 5.15.37
floppy: disable FDRAWCMD by default
bpf: Introduce composable reg, ret and arg types.
bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL
bpf: Replace RET_XXX_OR_NULL with RET_XXX | PTR_MAYBE_NULL
bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL
bpf: Introduce MEM_RDONLY flag
bpf: Convert PTR_TO_MEM_OR_NULL to composable types.
bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM.
bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem.
bpf/selftests: Test PTR_TO_RDONLY_MEM
bpf: Fix crash due to out of bounds access into reg2btf_ids.
spi: cadence-quadspi: fix write completion support
ARM: dts: socfpga: change qspi to "intel,socfpga-qspi"
mm: kfence: fix objcgs vector allocation
gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable}
iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
iov_iter: Introduce fault_in_iov_iter_writeable
gfs2: Add wrapper for iomap_file_buffered_write
gfs2: Clean up function may_grant
gfs2: Introduce flag for glock holder auto-demotion
gfs2: Move the inode glock locking to gfs2_file_buffered_write
gfs2: Eliminate ip->i_gh
gfs2: Fix mmap + page fault deadlocks for buffered I/O
iomap: Fix iomap_dio_rw return value for user copies
iomap: Support partial direct I/O on user copy failures
iomap: Add done_before argument to iomap_dio_rw
gup: Introduce FOLL_NOFAULT flag to disable page faults
iov_iter: Introduce nofault flag to disable page faults
gfs2: Fix mmap + page fault deadlocks for direct I/O
btrfs: fix deadlock due to page faults during direct IO reads and writes
btrfs: fallback to blocking mode when doing async dio over multiple extents
mm: gup: make fault_in_safe_writeable() use fixup_user_fault()
selftests/bpf: Add test for reg2btf_ids out of bounds access
Linux 5.15.37
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I785543e252f972c5a86f313e4b6721e2ff0797e6
This commit is contained in:
2
Makefile
2
Makefile
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
VERSION = 5
|
||||
PATCHLEVEL = 15
|
||||
SUBLEVEL = 36
|
||||
SUBLEVEL = 37
|
||||
EXTRAVERSION =
|
||||
NAME = Trick or Treat
|
||||
|
||||
|
||||
@@ -782,7 +782,7 @@
|
||||
};
|
||||
|
||||
qspi: spi@ff705000 {
|
||||
compatible = "cdns,qspi-nor";
|
||||
compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
reg = <0xff705000 0x1000>,
|
||||
|
||||
@@ -756,7 +756,7 @@
|
||||
};
|
||||
|
||||
qspi: spi@ff809000 {
|
||||
compatible = "cdns,qspi-nor";
|
||||
compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
reg = <0xff809000 0x100>,
|
||||
|
||||
@@ -594,7 +594,7 @@
|
||||
};
|
||||
|
||||
qspi: spi@ff8d2000 {
|
||||
compatible = "cdns,qspi-nor";
|
||||
compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
reg = <0xff8d2000 0x100>,
|
||||
|
||||
@@ -628,7 +628,7 @@
|
||||
};
|
||||
|
||||
qspi: spi@ff8d2000 {
|
||||
compatible = "cdns,qspi-nor";
|
||||
compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
|
||||
#address-cells = <1>;
|
||||
#size-cells = <0>;
|
||||
reg = <0xff8d2000 0x100>,
|
||||
|
||||
@@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void)
|
||||
on_each_cpu(kvm_map_magic_page, &features, 1);
|
||||
|
||||
/* Quick self-test to see if the mapping works */
|
||||
if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
|
||||
if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
|
||||
sizeof(u32))) {
|
||||
kvm_patching_worked = false;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
|
||||
if (new_ctx == NULL)
|
||||
return 0;
|
||||
if (!access_ok(new_ctx, ctx_size) ||
|
||||
fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
|
||||
fault_in_readable((char __user *)new_ctx, ctx_size))
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
@@ -1239,7 +1239,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
|
||||
#endif
|
||||
|
||||
if (!access_ok(ctx, sizeof(*ctx)) ||
|
||||
fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
|
||||
fault_in_readable((char __user *)ctx, sizeof(*ctx)))
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
|
||||
@@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
|
||||
if (new_ctx == NULL)
|
||||
return 0;
|
||||
if (!access_ok(new_ctx, ctx_size) ||
|
||||
fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
|
||||
fault_in_readable((char __user *)new_ctx, ctx_size))
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
|
||||
@@ -205,7 +205,7 @@ retry:
|
||||
fpregs_unlock();
|
||||
|
||||
if (ret) {
|
||||
if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
|
||||
if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
|
||||
goto retry;
|
||||
return -EFAULT;
|
||||
}
|
||||
@@ -278,10 +278,9 @@ retry:
|
||||
if (ret != -EFAULT)
|
||||
return -EINVAL;
|
||||
|
||||
ret = fault_in_pages_readable(buf, size);
|
||||
if (!ret)
|
||||
if (!fault_in_readable(buf, size))
|
||||
goto retry;
|
||||
return ret;
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -33,6 +33,22 @@ config BLK_DEV_FD
|
||||
To compile this driver as a module, choose M here: the
|
||||
module will be called floppy.
|
||||
|
||||
config BLK_DEV_FD_RAWCMD
|
||||
bool "Support for raw floppy disk commands (DEPRECATED)"
|
||||
depends on BLK_DEV_FD
|
||||
help
|
||||
If you want to use actual physical floppies and expect to do
|
||||
special low-level hardware accesses to them (access and use
|
||||
non-standard formats, for example), then enable this.
|
||||
|
||||
Note that the code enabled by this option is rarely used and
|
||||
might be unstable or insecure, and distros should not enable it.
|
||||
|
||||
Note: FDRAWCMD is deprecated and will be removed from the kernel
|
||||
in the near future.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config AMIGA_FLOPPY
|
||||
tristate "Amiga floppy support"
|
||||
depends on AMIGA
|
||||
|
||||
@@ -2984,6 +2984,8 @@ static const char *drive_name(int type, int drive)
|
||||
return "(null)";
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_FD_RAWCMD
|
||||
|
||||
/* raw commands */
|
||||
static void raw_cmd_done(int flag)
|
||||
{
|
||||
@@ -3183,6 +3185,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
|
||||
void __user *param)
|
||||
{
|
||||
int ret;
|
||||
|
||||
pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n");
|
||||
|
||||
if (type)
|
||||
return -EINVAL;
|
||||
if (lock_fdc(drive))
|
||||
return -EINTR;
|
||||
set_floppy(drive);
|
||||
ret = raw_cmd_ioctl(cmd, param);
|
||||
if (ret == -EINTR)
|
||||
return -EINTR;
|
||||
process_fd_request();
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else /* CONFIG_BLK_DEV_FD_RAWCMD */
|
||||
|
||||
static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
|
||||
void __user *param)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static int invalidate_drive(struct block_device *bdev)
|
||||
{
|
||||
/* invalidate the buffer track to force a reread */
|
||||
@@ -3371,7 +3402,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
|
||||
{
|
||||
int drive = (long)bdev->bd_disk->private_data;
|
||||
int type = ITYPE(drive_state[drive].fd_device);
|
||||
int i;
|
||||
int ret;
|
||||
int size;
|
||||
union inparam {
|
||||
@@ -3522,16 +3552,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
|
||||
outparam = &write_errors[drive];
|
||||
break;
|
||||
case FDRAWCMD:
|
||||
if (type)
|
||||
return -EINVAL;
|
||||
if (lock_fdc(drive))
|
||||
return -EINTR;
|
||||
set_floppy(drive);
|
||||
i = raw_cmd_ioctl(cmd, (void __user *)param);
|
||||
if (i == -EINTR)
|
||||
return -EINTR;
|
||||
process_fd_request();
|
||||
return i;
|
||||
return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param);
|
||||
case FDTWADDLE:
|
||||
if (lock_fdc(drive))
|
||||
return -EINTR;
|
||||
|
||||
@@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
|
||||
struct drm_armada_gem_pwrite *args = data;
|
||||
struct armada_gem_object *dobj;
|
||||
char __user *ptr;
|
||||
int ret;
|
||||
int ret = 0;
|
||||
|
||||
DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
|
||||
args->handle, args->offset, args->size, args->ptr);
|
||||
@@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
|
||||
if (!access_ok(ptr, args->size))
|
||||
return -EFAULT;
|
||||
|
||||
ret = fault_in_pages_readable(ptr, args->size);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (fault_in_readable(ptr, args->size))
|
||||
return -EFAULT;
|
||||
|
||||
dobj = armada_gem_object_lookup(file, args->handle);
|
||||
if (dobj == NULL)
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
/* Quirks */
|
||||
#define CQSPI_NEEDS_WR_DELAY BIT(0)
|
||||
#define CQSPI_DISABLE_DAC_MODE BIT(1)
|
||||
#define CQSPI_NO_SUPPORT_WR_COMPLETION BIT(3)
|
||||
|
||||
/* Capabilities */
|
||||
#define CQSPI_SUPPORTS_OCTAL BIT(0)
|
||||
@@ -83,6 +84,7 @@ struct cqspi_st {
|
||||
u32 wr_delay;
|
||||
bool use_direct_mode;
|
||||
struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
|
||||
bool wr_completion;
|
||||
};
|
||||
|
||||
struct cqspi_driver_platdata {
|
||||
@@ -797,9 +799,11 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata,
|
||||
* polling on the controller's side. spinand and spi-nor will take
|
||||
* care of polling the status register.
|
||||
*/
|
||||
reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
|
||||
reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
|
||||
writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
|
||||
if (cqspi->wr_completion) {
|
||||
reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
|
||||
reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
|
||||
writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
|
||||
}
|
||||
|
||||
reg = readl(reg_base + CQSPI_REG_SIZE);
|
||||
reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
|
||||
@@ -1532,6 +1536,10 @@ static int cqspi_probe(struct platform_device *pdev)
|
||||
|
||||
cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
|
||||
master->max_speed_hz = cqspi->master_ref_clk_hz;
|
||||
|
||||
/* write completion is supported by default */
|
||||
cqspi->wr_completion = true;
|
||||
|
||||
ddata = of_device_get_match_data(dev);
|
||||
if (ddata) {
|
||||
if (ddata->quirks & CQSPI_NEEDS_WR_DELAY)
|
||||
@@ -1541,6 +1549,8 @@ static int cqspi_probe(struct platform_device *pdev)
|
||||
master->mode_bits |= SPI_RX_OCTAL | SPI_TX_OCTAL;
|
||||
if (!(ddata->quirks & CQSPI_DISABLE_DAC_MODE))
|
||||
cqspi->use_direct_mode = true;
|
||||
if (ddata->quirks & CQSPI_NO_SUPPORT_WR_COMPLETION)
|
||||
cqspi->wr_completion = false;
|
||||
}
|
||||
|
||||
ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
|
||||
@@ -1649,6 +1659,10 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = {
|
||||
.quirks = CQSPI_DISABLE_DAC_MODE,
|
||||
};
|
||||
|
||||
static const struct cqspi_driver_platdata socfpga_qspi = {
|
||||
.quirks = CQSPI_NO_SUPPORT_WR_COMPLETION,
|
||||
};
|
||||
|
||||
static const struct of_device_id cqspi_dt_ids[] = {
|
||||
{
|
||||
.compatible = "cdns,qspi-nor",
|
||||
@@ -1666,6 +1680,10 @@ static const struct of_device_id cqspi_dt_ids[] = {
|
||||
.compatible = "intel,lgm-qspi",
|
||||
.data = &intel_lgm_qspi,
|
||||
},
|
||||
{
|
||||
.compatible = "intel,socfpga-qspi",
|
||||
.data = (void *)&socfpga_qspi,
|
||||
},
|
||||
{ /* end of table */ }
|
||||
};
|
||||
|
||||
|
||||
141
fs/btrfs/file.c
141
fs/btrfs/file.c
@@ -1709,7 +1709,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
|
||||
* Fault pages before locking them in prepare_pages
|
||||
* to avoid recursive lock
|
||||
*/
|
||||
if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
|
||||
if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
|
||||
ret = -EFAULT;
|
||||
break;
|
||||
}
|
||||
@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
|
||||
|
||||
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
||||
loff_t pos;
|
||||
ssize_t written = 0;
|
||||
ssize_t written_buffered;
|
||||
size_t prev_left = 0;
|
||||
loff_t endbyte;
|
||||
ssize_t err;
|
||||
unsigned int ilock_flags = 0;
|
||||
struct iomap_dio *dio = NULL;
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
ilock_flags |= BTRFS_ILOCK_TRY;
|
||||
@@ -1955,23 +1956,80 @@ relock:
|
||||
goto buffered;
|
||||
}
|
||||
|
||||
dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
||||
0, 0);
|
||||
/*
|
||||
* We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
|
||||
* calls generic_write_sync() (through iomap_dio_complete()), because
|
||||
* that results in calling fsync (btrfs_sync_file()) which will try to
|
||||
* lock the inode in exclusive/write mode.
|
||||
*/
|
||||
if (is_sync_write)
|
||||
iocb->ki_flags &= ~IOCB_DSYNC;
|
||||
|
||||
/*
|
||||
* The iov_iter can be mapped to the same file range we are writing to.
|
||||
* If that's the case, then we will deadlock in the iomap code, because
|
||||
* it first calls our callback btrfs_dio_iomap_begin(), which will create
|
||||
* an ordered extent, and after that it will fault in the pages that the
|
||||
* iov_iter refers to. During the fault in we end up in the readahead
|
||||
* pages code (starting at btrfs_readahead()), which will lock the range,
|
||||
* find that ordered extent and then wait for it to complete (at
|
||||
* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
|
||||
* obviously the ordered extent can never complete as we didn't submit
|
||||
* yet the respective bio(s). This always happens when the buffer is
|
||||
* memory mapped to the same file range, since the iomap DIO code always
|
||||
* invalidates pages in the target file range (after starting and waiting
|
||||
* for any writeback).
|
||||
*
|
||||
* So here we disable page faults in the iov_iter and then retry if we
|
||||
* got -EFAULT, faulting in the pages before the retry.
|
||||
*/
|
||||
again:
|
||||
from->nofault = true;
|
||||
err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
||||
IOMAP_DIO_PARTIAL, written);
|
||||
from->nofault = false;
|
||||
|
||||
/* No increment (+=) because iomap returns a cumulative value. */
|
||||
if (err > 0)
|
||||
written = err;
|
||||
|
||||
if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
|
||||
const size_t left = iov_iter_count(from);
|
||||
/*
|
||||
* We have more data left to write. Try to fault in as many as
|
||||
* possible of the remainder pages and retry. We do this without
|
||||
* releasing and locking again the inode, to prevent races with
|
||||
* truncate.
|
||||
*
|
||||
* Also, in case the iov refers to pages in the file range of the
|
||||
* file we want to write to (due to a mmap), we could enter an
|
||||
* infinite loop if we retry after faulting the pages in, since
|
||||
* iomap will invalidate any pages in the range early on, before
|
||||
* it tries to fault in the pages of the iov. So we keep track of
|
||||
* how much was left of iov in the previous EFAULT and fallback
|
||||
* to buffered IO in case we haven't made any progress.
|
||||
*/
|
||||
if (left == prev_left) {
|
||||
err = -ENOTBLK;
|
||||
} else {
|
||||
fault_in_iov_iter_readable(from, left);
|
||||
prev_left = left;
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
|
||||
btrfs_inode_unlock(inode, ilock_flags);
|
||||
|
||||
if (IS_ERR_OR_NULL(dio)) {
|
||||
err = PTR_ERR_OR_ZERO(dio);
|
||||
if (err < 0 && err != -ENOTBLK)
|
||||
goto out;
|
||||
} else {
|
||||
written = iomap_dio_complete(dio);
|
||||
}
|
||||
/*
|
||||
* Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
|
||||
* the fsync (call generic_write_sync()).
|
||||
*/
|
||||
if (is_sync_write)
|
||||
iocb->ki_flags |= IOCB_DSYNC;
|
||||
|
||||
if (written < 0 || !iov_iter_count(from)) {
|
||||
err = written;
|
||||
/* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
|
||||
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
|
||||
goto out;
|
||||
}
|
||||
|
||||
buffered:
|
||||
pos = iocb->ki_pos;
|
||||
@@ -1996,7 +2054,7 @@ buffered:
|
||||
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
|
||||
endbyte >> PAGE_SHIFT);
|
||||
out:
|
||||
return written ? written : err;
|
||||
return err < 0 ? err : written;
|
||||
}
|
||||
|
||||
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
|
||||
@@ -3660,6 +3718,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
|
||||
static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
struct inode *inode = file_inode(iocb->ki_filp);
|
||||
size_t prev_left = 0;
|
||||
ssize_t read = 0;
|
||||
ssize_t ret;
|
||||
|
||||
if (fsverity_active(inode))
|
||||
@@ -3669,10 +3729,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
|
||||
return 0;
|
||||
|
||||
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
|
||||
again:
|
||||
/*
|
||||
* This is similar to what we do for direct IO writes, see the comment
|
||||
* at btrfs_direct_write(), but we also disable page faults in addition
|
||||
* to disabling them only at the iov_iter level. This is because when
|
||||
* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
|
||||
* which can still trigger page fault ins despite having set ->nofault
|
||||
* to true for our 'to' iov_iter.
|
||||
*
|
||||
* The difference to direct IO writes is that we deadlock when trying
|
||||
* to lock the extent range in the inode's tree during the page reads
|
||||
* triggered by the fault in (while for writes it is due to waiting for
|
||||
* our own ordered extent). This is because for direct IO reads,
|
||||
* btrfs_dio_iomap_begin() returns with the extent range locked, which
|
||||
* is only unlocked in the endio callback (end_bio_extent_readpage()).
|
||||
*/
|
||||
pagefault_disable();
|
||||
to->nofault = true;
|
||||
ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
||||
0, 0);
|
||||
IOMAP_DIO_PARTIAL, read);
|
||||
to->nofault = false;
|
||||
pagefault_enable();
|
||||
|
||||
/* No increment (+=) because iomap returns a cumulative value. */
|
||||
if (ret > 0)
|
||||
read = ret;
|
||||
|
||||
if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
|
||||
const size_t left = iov_iter_count(to);
|
||||
|
||||
if (left == prev_left) {
|
||||
/*
|
||||
* We didn't make any progress since the last attempt,
|
||||
* fallback to a buffered read for the remainder of the
|
||||
* range. This is just to avoid any possibility of looping
|
||||
* for too long.
|
||||
*/
|
||||
ret = read;
|
||||
} else {
|
||||
/*
|
||||
* We made some progress since the last retry or this is
|
||||
* the first time we are retrying. Fault in as many pages
|
||||
* as possible and retry.
|
||||
*/
|
||||
fault_in_iov_iter_writeable(to, left);
|
||||
prev_left = left;
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
return ret;
|
||||
return ret < 0 ? ret : read;
|
||||
}
|
||||
|
||||
static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
|
||||
@@ -7961,6 +7961,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
|
||||
}
|
||||
|
||||
len = min(len, em->len - (start - em->start));
|
||||
|
||||
/*
|
||||
* If we have a NOWAIT request and the range contains multiple extents
|
||||
* (or a mix of extents and holes), then we return -EAGAIN to make the
|
||||
* caller fallback to a context where it can do a blocking (without
|
||||
* NOWAIT) request. This way we avoid doing partial IO and returning
|
||||
* success to the caller, which is not optimal for writes and for reads
|
||||
* it can result in unexpected behaviour for an application.
|
||||
*
|
||||
* When doing a read, because we use IOMAP_DIO_PARTIAL when calling
|
||||
* iomap_dio_rw(), we can end up returning less data than what the caller
|
||||
* asked for, resulting in an unexpected, and incorrect, short read.
|
||||
* That is, the caller asked to read N bytes and we return less than that,
|
||||
* which is wrong unless we are crossing EOF. This happens if we get a
|
||||
* page fault error when trying to fault in pages for the buffer that is
|
||||
* associated to the struct iov_iter passed to iomap_dio_rw(), and we
|
||||
* have previously submitted bios for other extents in the range, in
|
||||
* which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
|
||||
* those bios have completed by the time we get the page fault error,
|
||||
* which we return back to our caller - we should only return EIOCBQUEUED
|
||||
* after we have submitted bios for all the extents in the range.
|
||||
*/
|
||||
if ((flags & IOMAP_NOWAIT) && len < length) {
|
||||
free_extent_map(em);
|
||||
ret = -EAGAIN;
|
||||
goto unlock_err;
|
||||
}
|
||||
|
||||
if (write) {
|
||||
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
|
||||
start, len);
|
||||
|
||||
@@ -2258,9 +2258,8 @@ static noinline int search_ioctl(struct inode *inode,
|
||||
key.offset = sk->min_offset;
|
||||
|
||||
while (1) {
|
||||
ret = fault_in_pages_writeable(ubuf + sk_offset,
|
||||
*buf_size - sk_offset);
|
||||
if (ret)
|
||||
ret = -EFAULT;
|
||||
if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
|
||||
break;
|
||||
|
||||
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
|
||||
|
||||
@@ -4414,7 +4414,7 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
|
||||
return 0;
|
||||
|
||||
/* If it will be a short write, don't bother. */
|
||||
if (iov_iter_fault_in_readable(iter, count))
|
||||
if (fault_in_iov_iter_readable(iter, count))
|
||||
return 0;
|
||||
|
||||
if (f2fs_has_inline_data(inode)) {
|
||||
|
||||
@@ -1166,7 +1166,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
|
||||
|
||||
again:
|
||||
err = -EFAULT;
|
||||
if (iov_iter_fault_in_readable(ii, bytes))
|
||||
if (fault_in_iov_iter_readable(ii, bytes))
|
||||
break;
|
||||
|
||||
err = -ENOMEM;
|
||||
|
||||
@@ -961,46 +961,6 @@ hole_found:
|
||||
goto out;
|
||||
}
|
||||
|
||||
static int gfs2_write_lock(struct inode *inode)
|
||||
{
|
||||
struct gfs2_inode *ip = GFS2_I(inode);
|
||||
struct gfs2_sbd *sdp = GFS2_SB(inode);
|
||||
int error;
|
||||
|
||||
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
|
||||
error = gfs2_glock_nq(&ip->i_gh);
|
||||
if (error)
|
||||
goto out_uninit;
|
||||
if (&ip->i_inode == sdp->sd_rindex) {
|
||||
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
|
||||
|
||||
error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
|
||||
GL_NOCACHE, &m_ip->i_gh);
|
||||
if (error)
|
||||
goto out_unlock;
|
||||
}
|
||||
return 0;
|
||||
|
||||
out_unlock:
|
||||
gfs2_glock_dq(&ip->i_gh);
|
||||
out_uninit:
|
||||
gfs2_holder_uninit(&ip->i_gh);
|
||||
return error;
|
||||
}
|
||||
|
||||
static void gfs2_write_unlock(struct inode *inode)
|
||||
{
|
||||
struct gfs2_inode *ip = GFS2_I(inode);
|
||||
struct gfs2_sbd *sdp = GFS2_SB(inode);
|
||||
|
||||
if (&ip->i_inode == sdp->sd_rindex) {
|
||||
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
|
||||
|
||||
gfs2_glock_dq_uninit(&m_ip->i_gh);
|
||||
}
|
||||
gfs2_glock_dq_uninit(&ip->i_gh);
|
||||
}
|
||||
|
||||
static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
|
||||
unsigned len)
|
||||
{
|
||||
@@ -1118,11 +1078,6 @@ out_qunlock:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool gfs2_iomap_need_write_lock(unsigned flags)
|
||||
{
|
||||
return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
|
||||
}
|
||||
|
||||
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
|
||||
unsigned flags, struct iomap *iomap,
|
||||
struct iomap *srcmap)
|
||||
@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
|
||||
iomap->flags |= IOMAP_F_BUFFER_HEAD;
|
||||
|
||||
trace_gfs2_iomap_start(ip, pos, length, flags);
|
||||
if (gfs2_iomap_need_write_lock(flags)) {
|
||||
ret = gfs2_write_lock(inode);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
|
||||
ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
|
||||
|
||||
out_unlock:
|
||||
if (ret && gfs2_iomap_need_write_lock(flags))
|
||||
gfs2_write_unlock(inode);
|
||||
release_metapath(&mp);
|
||||
out:
|
||||
trace_gfs2_iomap_end(ip, iomap, ret);
|
||||
return ret;
|
||||
}
|
||||
@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
|
||||
}
|
||||
|
||||
if (unlikely(!written))
|
||||
goto out_unlock;
|
||||
return 0;
|
||||
|
||||
if (iomap->flags & IOMAP_F_SIZE_CHANGED)
|
||||
mark_inode_dirty(inode);
|
||||
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
|
||||
|
||||
out_unlock:
|
||||
if (gfs2_iomap_need_write_lock(flags))
|
||||
gfs2_write_unlock(inode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
252
fs/gfs2/file.c
252
fs/gfs2/file.c
@@ -777,27 +777,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
|
||||
return ret ? ret : ret1;
|
||||
}
|
||||
|
||||
static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
|
||||
size_t *prev_count,
|
||||
size_t *window_size)
|
||||
{
|
||||
char __user *p = i->iov[0].iov_base + i->iov_offset;
|
||||
size_t count = iov_iter_count(i);
|
||||
int pages = 1;
|
||||
|
||||
if (likely(!count))
|
||||
return false;
|
||||
if (ret <= 0 && ret != -EFAULT)
|
||||
return false;
|
||||
if (!iter_is_iovec(i))
|
||||
return false;
|
||||
|
||||
if (*prev_count != count || !*window_size) {
|
||||
int pages, nr_dirtied;
|
||||
|
||||
pages = min_t(int, BIO_MAX_VECS,
|
||||
DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
|
||||
nr_dirtied = max(current->nr_dirtied_pause -
|
||||
current->nr_dirtied, 1);
|
||||
pages = min(pages, nr_dirtied);
|
||||
}
|
||||
|
||||
*prev_count = count;
|
||||
*window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
|
||||
return true;
|
||||
}
|
||||
|
||||
static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
|
||||
struct gfs2_holder *gh)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
|
||||
size_t count = iov_iter_count(to);
|
||||
size_t prev_count = 0, window_size = 0;
|
||||
size_t written = 0;
|
||||
ssize_t ret;
|
||||
|
||||
if (!count)
|
||||
/*
|
||||
* In this function, we disable page faults when we're holding the
|
||||
* inode glock while doing I/O. If a page fault occurs, we indicate
|
||||
* that the inode glock may be dropped, fault in the pages manually,
|
||||
* and retry.
|
||||
*
|
||||
* Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
|
||||
* physical as well as manual page faults, and we need to disable both
|
||||
* kinds.
|
||||
*
|
||||
* For direct I/O, gfs2 takes the inode glock in deferred mode. This
|
||||
* locking mode is compatible with other deferred holders, so multiple
|
||||
* processes and nodes can do direct I/O to a file at the same time.
|
||||
* There's no guarantee that reads or writes will be atomic. Any
|
||||
* coordination among readers and writers needs to happen externally.
|
||||
*/
|
||||
|
||||
if (!iov_iter_count(to))
|
||||
return 0; /* skip atime */
|
||||
|
||||
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
|
||||
retry:
|
||||
ret = gfs2_glock_nq(gh);
|
||||
if (ret)
|
||||
goto out_uninit;
|
||||
retry_under_glock:
|
||||
pagefault_disable();
|
||||
to->nofault = true;
|
||||
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
|
||||
IOMAP_DIO_PARTIAL, written);
|
||||
to->nofault = false;
|
||||
pagefault_enable();
|
||||
if (ret > 0)
|
||||
written = ret;
|
||||
|
||||
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0, 0);
|
||||
gfs2_glock_dq(gh);
|
||||
if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
|
||||
size_t leftover;
|
||||
|
||||
gfs2_holder_allow_demote(gh);
|
||||
leftover = fault_in_iov_iter_writeable(to, window_size);
|
||||
gfs2_holder_disallow_demote(gh);
|
||||
if (leftover != window_size) {
|
||||
if (!gfs2_holder_queued(gh))
|
||||
goto retry;
|
||||
goto retry_under_glock;
|
||||
}
|
||||
}
|
||||
if (gfs2_holder_queued(gh))
|
||||
gfs2_glock_dq(gh);
|
||||
out_uninit:
|
||||
gfs2_holder_uninit(gh);
|
||||
return ret;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
return written;
|
||||
}
|
||||
|
||||
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
|
||||
@@ -806,10 +878,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
struct gfs2_inode *ip = GFS2_I(inode);
|
||||
size_t len = iov_iter_count(from);
|
||||
loff_t offset = iocb->ki_pos;
|
||||
size_t prev_count = 0, window_size = 0;
|
||||
size_t read = 0;
|
||||
ssize_t ret;
|
||||
|
||||
/*
|
||||
* In this function, we disable page faults when we're holding the
|
||||
* inode glock while doing I/O. If a page fault occurs, we indicate
|
||||
* that the inode glock may be dropped, fault in the pages manually,
|
||||
* and retry.
|
||||
*
|
||||
* For writes, iomap_dio_rw only triggers manual page faults, so we
|
||||
* don't need to disable physical ones.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Deferred lock, even if it's a write, since we do no allocation on
|
||||
* this path. All we need to change is the atime, and this lock mode
|
||||
@@ -819,31 +901,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
|
||||
* VFS does.
|
||||
*/
|
||||
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
|
||||
retry:
|
||||
ret = gfs2_glock_nq(gh);
|
||||
if (ret)
|
||||
goto out_uninit;
|
||||
|
||||
retry_under_glock:
|
||||
/* Silently fall back to buffered I/O when writing beyond EOF */
|
||||
if (offset + len > i_size_read(&ip->i_inode))
|
||||
if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
|
||||
goto out;
|
||||
|
||||
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0, 0);
|
||||
from->nofault = true;
|
||||
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
|
||||
IOMAP_DIO_PARTIAL, read);
|
||||
from->nofault = false;
|
||||
|
||||
if (ret == -ENOTBLK)
|
||||
ret = 0;
|
||||
if (ret > 0)
|
||||
read = ret;
|
||||
|
||||
if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
|
||||
size_t leftover;
|
||||
|
||||
gfs2_holder_allow_demote(gh);
|
||||
leftover = fault_in_iov_iter_readable(from, window_size);
|
||||
gfs2_holder_disallow_demote(gh);
|
||||
if (leftover != window_size) {
|
||||
if (!gfs2_holder_queued(gh))
|
||||
goto retry;
|
||||
goto retry_under_glock;
|
||||
}
|
||||
}
|
||||
out:
|
||||
gfs2_glock_dq(gh);
|
||||
if (gfs2_holder_queued(gh))
|
||||
gfs2_glock_dq(gh);
|
||||
out_uninit:
|
||||
gfs2_holder_uninit(gh);
|
||||
return ret;
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
return read;
|
||||
}
|
||||
|
||||
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
struct gfs2_inode *ip;
|
||||
struct gfs2_holder gh;
|
||||
size_t prev_count = 0, window_size = 0;
|
||||
size_t written = 0;
|
||||
ssize_t ret;
|
||||
|
||||
/*
|
||||
* In this function, we disable page faults when we're holding the
|
||||
* inode glock while doing I/O. If a page fault occurs, we indicate
|
||||
* that the inode glock may be dropped, fault in the pages manually,
|
||||
* and retry.
|
||||
*/
|
||||
|
||||
if (iocb->ki_flags & IOCB_DIRECT) {
|
||||
ret = gfs2_file_direct_read(iocb, to, &gh);
|
||||
if (likely(ret != -ENOTBLK))
|
||||
@@ -865,18 +978,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
}
|
||||
ip = GFS2_I(iocb->ki_filp->f_mapping->host);
|
||||
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
|
||||
retry:
|
||||
ret = gfs2_glock_nq(&gh);
|
||||
if (ret)
|
||||
goto out_uninit;
|
||||
retry_under_glock:
|
||||
pagefault_disable();
|
||||
ret = generic_file_read_iter(iocb, to);
|
||||
pagefault_enable();
|
||||
if (ret > 0)
|
||||
written += ret;
|
||||
gfs2_glock_dq(&gh);
|
||||
|
||||
if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
|
||||
size_t leftover;
|
||||
|
||||
gfs2_holder_allow_demote(&gh);
|
||||
leftover = fault_in_iov_iter_writeable(to, window_size);
|
||||
gfs2_holder_disallow_demote(&gh);
|
||||
if (leftover != window_size) {
|
||||
if (!gfs2_holder_queued(&gh)) {
|
||||
if (written)
|
||||
goto out_uninit;
|
||||
goto retry;
|
||||
}
|
||||
goto retry_under_glock;
|
||||
}
|
||||
}
|
||||
if (gfs2_holder_queued(&gh))
|
||||
gfs2_glock_dq(&gh);
|
||||
out_uninit:
|
||||
gfs2_holder_uninit(&gh);
|
||||
return written ? written : ret;
|
||||
}
|
||||
|
||||
static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
|
||||
struct iov_iter *from,
|
||||
struct gfs2_holder *gh)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct gfs2_inode *ip = GFS2_I(inode);
|
||||
struct gfs2_sbd *sdp = GFS2_SB(inode);
|
||||
struct gfs2_holder *statfs_gh = NULL;
|
||||
size_t prev_count = 0, window_size = 0;
|
||||
size_t read = 0;
|
||||
ssize_t ret;
|
||||
|
||||
/*
|
||||
* In this function, we disable page faults when we're holding the
|
||||
* inode glock while doing I/O. If a page fault occurs, we indicate
|
||||
* that the inode glock may be dropped, fault in the pages manually,
|
||||
* and retry.
|
||||
*/
|
||||
|
||||
if (inode == sdp->sd_rindex) {
|
||||
statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
|
||||
if (!statfs_gh)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
|
||||
retry:
|
||||
ret = gfs2_glock_nq(gh);
|
||||
if (ret)
|
||||
goto out_uninit;
|
||||
retry_under_glock:
|
||||
if (inode == sdp->sd_rindex) {
|
||||
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
|
||||
|
||||
ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
|
||||
GL_NOCACHE, statfs_gh);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
current->backing_dev_info = inode_to_bdi(inode);
|
||||
pagefault_disable();
|
||||
ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
|
||||
pagefault_enable();
|
||||
current->backing_dev_info = NULL;
|
||||
if (ret > 0) {
|
||||
iocb->ki_pos += ret;
|
||||
read += ret;
|
||||
}
|
||||
|
||||
if (inode == sdp->sd_rindex)
|
||||
gfs2_glock_dq_uninit(statfs_gh);
|
||||
|
||||
if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
|
||||
size_t leftover;
|
||||
|
||||
gfs2_holder_allow_demote(gh);
|
||||
leftover = fault_in_iov_iter_readable(from, window_size);
|
||||
gfs2_holder_disallow_demote(gh);
|
||||
if (leftover != window_size) {
|
||||
if (!gfs2_holder_queued(gh)) {
|
||||
if (read)
|
||||
goto out_uninit;
|
||||
goto retry;
|
||||
}
|
||||
goto retry_under_glock;
|
||||
}
|
||||
}
|
||||
out_unlock:
|
||||
if (gfs2_holder_queued(gh))
|
||||
gfs2_glock_dq(gh);
|
||||
out_uninit:
|
||||
gfs2_holder_uninit(gh);
|
||||
if (statfs_gh)
|
||||
kfree(statfs_gh);
|
||||
return read ? read : ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* gfs2_file_write_iter - Perform a write to a file
|
||||
* @iocb: The io context
|
||||
@@ -928,9 +1141,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
goto out_unlock;
|
||||
|
||||
iocb->ki_flags |= IOCB_DSYNC;
|
||||
current->backing_dev_info = inode_to_bdi(inode);
|
||||
buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
|
||||
current->backing_dev_info = NULL;
|
||||
buffered = gfs2_file_buffered_write(iocb, from, &gh);
|
||||
if (unlikely(buffered <= 0)) {
|
||||
if (!ret)
|
||||
ret = buffered;
|
||||
@@ -944,7 +1155,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
* the direct I/O range as we don't know if the buffered pages
|
||||
* made it to disk.
|
||||
*/
|
||||
iocb->ki_pos += buffered;
|
||||
ret2 = generic_write_sync(iocb, buffered);
|
||||
invalidate_mapping_pages(mapping,
|
||||
(iocb->ki_pos - buffered) >> PAGE_SHIFT,
|
||||
@@ -952,13 +1162,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
if (!ret || ret2 > 0)
|
||||
ret += ret2;
|
||||
} else {
|
||||
current->backing_dev_info = inode_to_bdi(inode);
|
||||
ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
|
||||
current->backing_dev_info = NULL;
|
||||
if (likely(ret > 0)) {
|
||||
iocb->ki_pos += ret;
|
||||
ret = gfs2_file_buffered_write(iocb, from, &gh);
|
||||
if (likely(ret > 0))
|
||||
ret = generic_write_sync(iocb, ret);
|
||||
}
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
|
||||
328
fs/gfs2/glock.c
328
fs/gfs2/glock.c
@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
|
||||
typedef void (*glock_examiner) (struct gfs2_glock * gl);
|
||||
|
||||
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
|
||||
static void __gfs2_glock_dq(struct gfs2_holder *gh);
|
||||
|
||||
static struct dentry *gfs2_root;
|
||||
static struct workqueue_struct *glock_workqueue;
|
||||
@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
|
||||
|
||||
if (gl->gl_state == LM_ST_UNLOCKED)
|
||||
return 0;
|
||||
/*
|
||||
* Note that demote_ok is used for the lru process of disposing of
|
||||
* glocks. For this purpose, we don't care if the glock's holders
|
||||
* have the HIF_MAY_DEMOTE flag set or not. If someone is using
|
||||
* them, don't demote.
|
||||
*/
|
||||
if (!list_empty(&gl->gl_holders))
|
||||
return 0;
|
||||
if (glops->go_demote_ok)
|
||||
@@ -301,46 +308,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
|
||||
}
|
||||
|
||||
/**
|
||||
* may_grant - check if its ok to grant a new lock
|
||||
* may_grant - check if it's ok to grant a new lock
|
||||
* @gl: The glock
|
||||
* @current_gh: One of the current holders of @gl
|
||||
* @gh: The lock request which we wish to grant
|
||||
*
|
||||
* Returns: true if its ok to grant the lock
|
||||
* With our current compatibility rules, if a glock has one or more active
|
||||
* holders (HIF_HOLDER flag set), any of those holders can be passed in as
|
||||
* @current_gh; they are all the same as far as compatibility with the new @gh
|
||||
* goes.
|
||||
*
|
||||
* Returns true if it's ok to grant the lock.
|
||||
*/
|
||||
|
||||
static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
|
||||
static inline bool may_grant(struct gfs2_glock *gl,
|
||||
struct gfs2_holder *current_gh,
|
||||
struct gfs2_holder *gh)
|
||||
{
|
||||
const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
|
||||
if (current_gh) {
|
||||
GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, ¤t_gh->gh_iflags));
|
||||
|
||||
if (gh != gh_head) {
|
||||
/**
|
||||
* Here we make a special exception to grant holders who agree
|
||||
* to share the EX lock with other holders who also have the
|
||||
* bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
|
||||
* is set, we grant more holders with the bit set.
|
||||
*/
|
||||
if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
|
||||
(gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
|
||||
gh->gh_state == LM_ST_EXCLUSIVE &&
|
||||
(gh->gh_flags & LM_FLAG_NODE_SCOPE))
|
||||
return 1;
|
||||
if ((gh->gh_state == LM_ST_EXCLUSIVE ||
|
||||
gh_head->gh_state == LM_ST_EXCLUSIVE))
|
||||
return 0;
|
||||
switch(current_gh->gh_state) {
|
||||
case LM_ST_EXCLUSIVE:
|
||||
/*
|
||||
* Here we make a special exception to grant holders
|
||||
* who agree to share the EX lock with other holders
|
||||
* who also have the bit set. If the original holder
|
||||
* has the LM_FLAG_NODE_SCOPE bit set, we grant more
|
||||
* holders with the bit set.
|
||||
*/
|
||||
return gh->gh_state == LM_ST_EXCLUSIVE &&
|
||||
(current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
|
||||
(gh->gh_flags & LM_FLAG_NODE_SCOPE);
|
||||
|
||||
case LM_ST_SHARED:
|
||||
case LM_ST_DEFERRED:
|
||||
return gh->gh_state == current_gh->gh_state;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (gl->gl_state == gh->gh_state)
|
||||
return 1;
|
||||
return true;
|
||||
if (gh->gh_flags & GL_EXACT)
|
||||
return 0;
|
||||
return false;
|
||||
if (gl->gl_state == LM_ST_EXCLUSIVE) {
|
||||
if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
|
||||
return 1;
|
||||
if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
|
||||
return 1;
|
||||
return gh->gh_state == LM_ST_SHARED ||
|
||||
gh->gh_state == LM_ST_DEFERRED;
|
||||
}
|
||||
if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
|
||||
return 1;
|
||||
return 0;
|
||||
if (gh->gh_flags & LM_FLAG_ANY)
|
||||
return gl->gl_state != LM_ST_UNLOCKED;
|
||||
return false;
|
||||
}
|
||||
|
||||
static void gfs2_holder_wake(struct gfs2_holder *gh)
|
||||
@@ -366,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
|
||||
struct gfs2_holder *gh, *tmp;
|
||||
|
||||
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
|
||||
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
|
||||
if (!test_bit(HIF_WAIT, &gh->gh_iflags))
|
||||
continue;
|
||||
if (ret & LM_OUT_ERROR)
|
||||
gh->gh_error = -EIO;
|
||||
@@ -380,6 +400,78 @@ static void do_error(struct gfs2_glock *gl, const int ret)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* demote_incompat_holders - demote incompatible demoteable holders
|
||||
* @gl: the glock we want to promote
|
||||
* @new_gh: the new holder to be promoted
|
||||
*/
|
||||
static void demote_incompat_holders(struct gfs2_glock *gl,
|
||||
struct gfs2_holder *new_gh)
|
||||
{
|
||||
struct gfs2_holder *gh;
|
||||
|
||||
/*
|
||||
* Demote incompatible holders before we make ourselves eligible.
|
||||
* (This holder may or may not allow auto-demoting, but we don't want
|
||||
* to demote the new holder before it's even granted.)
|
||||
*/
|
||||
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
|
||||
/*
|
||||
* Since holders are at the front of the list, we stop when we
|
||||
* find the first non-holder.
|
||||
*/
|
||||
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
|
||||
return;
|
||||
if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
|
||||
!may_grant(gl, new_gh, gh)) {
|
||||
/*
|
||||
* We should not recurse into do_promote because
|
||||
* __gfs2_glock_dq only calls handle_callback,
|
||||
* gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
|
||||
*/
|
||||
__gfs2_glock_dq(gh);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* find_first_holder - find the first "holder" gh
|
||||
* @gl: the glock
|
||||
*/
|
||||
|
||||
static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
|
||||
{
|
||||
struct gfs2_holder *gh;
|
||||
|
||||
if (!list_empty(&gl->gl_holders)) {
|
||||
gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
|
||||
gh_list);
|
||||
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
|
||||
return gh;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* find_first_strong_holder - find the first non-demoteable holder
|
||||
* @gl: the glock
|
||||
*
|
||||
* Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
|
||||
*/
|
||||
static inline struct gfs2_holder *
|
||||
find_first_strong_holder(struct gfs2_glock *gl)
|
||||
{
|
||||
struct gfs2_holder *gh;
|
||||
|
||||
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
|
||||
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
|
||||
return NULL;
|
||||
if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
|
||||
return gh;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* do_promote - promote as many requests as possible on the current queue
|
||||
* @gl: The glock
|
||||
@@ -393,14 +485,21 @@ __releases(&gl->gl_lockref.lock)
|
||||
__acquires(&gl->gl_lockref.lock)
|
||||
{
|
||||
const struct gfs2_glock_operations *glops = gl->gl_ops;
|
||||
struct gfs2_holder *gh, *tmp;
|
||||
struct gfs2_holder *gh, *tmp, *first_gh;
|
||||
bool incompat_holders_demoted = false;
|
||||
int ret;
|
||||
|
||||
restart:
|
||||
first_gh = find_first_strong_holder(gl);
|
||||
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
|
||||
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
|
||||
if (!test_bit(HIF_WAIT, &gh->gh_iflags))
|
||||
continue;
|
||||
if (may_grant(gl, gh)) {
|
||||
if (may_grant(gl, first_gh, gh)) {
|
||||
if (!incompat_holders_demoted) {
|
||||
demote_incompat_holders(gl, first_gh);
|
||||
incompat_holders_demoted = true;
|
||||
first_gh = gh;
|
||||
}
|
||||
if (gh->gh_list.prev == &gl->gl_holders &&
|
||||
glops->go_lock) {
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
@@ -426,6 +525,11 @@ restart:
|
||||
gfs2_holder_wake(gh);
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* If we get here, it means we may not grant this holder for
|
||||
* some reason. If this holder is the head of the list, it
|
||||
* means we have a blocked holder at the head, so return 1.
|
||||
*/
|
||||
if (gh->gh_list.prev == &gl->gl_holders)
|
||||
return 1;
|
||||
do_error(gl, 0);
|
||||
@@ -722,23 +826,6 @@ out:
|
||||
spin_lock(&gl->gl_lockref.lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* find_first_holder - find the first "holder" gh
|
||||
* @gl: the glock
|
||||
*/
|
||||
|
||||
static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
|
||||
{
|
||||
struct gfs2_holder *gh;
|
||||
|
||||
if (!list_empty(&gl->gl_holders)) {
|
||||
gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
|
||||
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
|
||||
return gh;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* run_queue - do all outstanding tasks related to a glock
|
||||
* @gl: The glock in question
|
||||
@@ -1354,15 +1441,20 @@ __acquires(&gl->gl_lockref.lock)
|
||||
GLOCK_BUG_ON(gl, true);
|
||||
|
||||
if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
|
||||
if (test_bit(GLF_LOCK, &gl->gl_flags))
|
||||
try_futile = !may_grant(gl, gh);
|
||||
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
|
||||
struct gfs2_holder *first_gh;
|
||||
|
||||
first_gh = find_first_strong_holder(gl);
|
||||
try_futile = !may_grant(gl, first_gh, gh);
|
||||
}
|
||||
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
|
||||
goto fail;
|
||||
}
|
||||
|
||||
list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
|
||||
if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
|
||||
(gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
|
||||
(gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
|
||||
!test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
|
||||
goto trap_recursive;
|
||||
if (try_futile &&
|
||||
!(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
|
||||
@@ -1458,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
|
||||
return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
|
||||
* @gh: the glock holder
|
||||
*
|
||||
*/
|
||||
static inline bool needs_demote(struct gfs2_glock *gl)
|
||||
{
|
||||
return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
|
||||
test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
|
||||
}
|
||||
|
||||
void gfs2_glock_dq(struct gfs2_holder *gh)
|
||||
static void __gfs2_glock_dq(struct gfs2_holder *gh)
|
||||
{
|
||||
struct gfs2_glock *gl = gh->gh_gl;
|
||||
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
|
||||
unsigned delay = 0;
|
||||
int fast_path = 0;
|
||||
|
||||
spin_lock(&gl->gl_lockref.lock);
|
||||
/*
|
||||
* If we're in the process of file system withdraw, we cannot just
|
||||
* dequeue any glocks until our journal is recovered, lest we
|
||||
* introduce file system corruption. We need two exceptions to this
|
||||
* rule: We need to allow unlocking of nondisk glocks and the glock
|
||||
* for our own journal that needs recovery.
|
||||
* This while loop is similar to function demote_incompat_holders:
|
||||
* If the glock is due to be demoted (which may be from another node
|
||||
* or even if this holder is GL_NOCACHE), the weak holders are
|
||||
* demoted as well, allowing the glock to be demoted.
|
||||
*/
|
||||
if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
|
||||
glock_blocked_by_withdraw(gl) &&
|
||||
gh->gh_gl != sdp->sd_jinode_gl) {
|
||||
sdp->sd_glock_dqs_held++;
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
might_sleep();
|
||||
wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
spin_lock(&gl->gl_lockref.lock);
|
||||
}
|
||||
if (gh->gh_flags & GL_NOCACHE)
|
||||
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
|
||||
while (gh) {
|
||||
/*
|
||||
* If we're in the process of file system withdraw, we cannot
|
||||
* just dequeue any glocks until our journal is recovered, lest
|
||||
* we introduce file system corruption. We need two exceptions
|
||||
* to this rule: We need to allow unlocking of nondisk glocks
|
||||
* and the glock for our own journal that needs recovery.
|
||||
*/
|
||||
if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
|
||||
glock_blocked_by_withdraw(gl) &&
|
||||
gh->gh_gl != sdp->sd_jinode_gl) {
|
||||
sdp->sd_glock_dqs_held++;
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
might_sleep();
|
||||
wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
spin_lock(&gl->gl_lockref.lock);
|
||||
}
|
||||
|
||||
list_del_init(&gh->gh_list);
|
||||
clear_bit(HIF_HOLDER, &gh->gh_iflags);
|
||||
if (list_empty(&gl->gl_holders) &&
|
||||
!test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
|
||||
!test_bit(GLF_DEMOTE, &gl->gl_flags))
|
||||
fast_path = 1;
|
||||
/*
|
||||
* This holder should not be cached, so mark it for demote.
|
||||
* Note: this should be done before the check for needs_demote
|
||||
* below.
|
||||
*/
|
||||
if (gh->gh_flags & GL_NOCACHE)
|
||||
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
|
||||
|
||||
list_del_init(&gh->gh_list);
|
||||
clear_bit(HIF_HOLDER, &gh->gh_iflags);
|
||||
trace_gfs2_glock_queue(gh, 0);
|
||||
|
||||
/*
|
||||
* If there hasn't been a demote request we are done.
|
||||
* (Let the remaining holders, if any, keep holding it.)
|
||||
*/
|
||||
if (!needs_demote(gl)) {
|
||||
if (list_empty(&gl->gl_holders))
|
||||
fast_path = 1;
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* If we have another strong holder (we cannot auto-demote)
|
||||
* we are done. It keeps holding it until it is done.
|
||||
*/
|
||||
if (find_first_strong_holder(gl))
|
||||
break;
|
||||
|
||||
/*
|
||||
* If we have a weak holder at the head of the list, it
|
||||
* (and all others like it) must be auto-demoted. If there
|
||||
* are no more weak holders, we exit the while loop.
|
||||
*/
|
||||
gh = find_first_holder(gl);
|
||||
}
|
||||
|
||||
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
|
||||
gfs2_glock_add_to_lru(gl);
|
||||
|
||||
trace_gfs2_glock_queue(gh, 0);
|
||||
if (unlikely(!fast_path)) {
|
||||
gl->gl_lockref.count++;
|
||||
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
|
||||
@@ -1511,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
|
||||
delay = gl->gl_hold_time;
|
||||
__gfs2_glock_queue_work(gl, delay);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
|
||||
* @gh: the glock holder
|
||||
*
|
||||
*/
|
||||
void gfs2_glock_dq(struct gfs2_holder *gh)
|
||||
{
|
||||
struct gfs2_glock *gl = gh->gh_gl;
|
||||
|
||||
spin_lock(&gl->gl_lockref.lock);
|
||||
__gfs2_glock_dq(gh);
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
}
|
||||
|
||||
@@ -1673,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
|
||||
|
||||
void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
|
||||
{
|
||||
struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
|
||||
unsigned long delay = 0;
|
||||
unsigned long holdtime;
|
||||
unsigned long now = jiffies;
|
||||
@@ -1687,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
|
||||
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
|
||||
delay = gl->gl_hold_time;
|
||||
}
|
||||
/*
|
||||
* Note 1: We cannot call demote_incompat_holders from handle_callback
|
||||
* or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
|
||||
* handle_callback -> demote_incompat_holders -> gfs2_glock_dq
|
||||
* Plus, we only want to demote the holders if the request comes from
|
||||
* a remote cluster node because local holder conflicts are resolved
|
||||
* elsewhere.
|
||||
*
|
||||
* Note 2: if a remote node wants this glock in EX mode, lock_dlm will
|
||||
* request that we set our state to UNLOCKED. Here we mock up a holder
|
||||
* to make it look like someone wants the lock EX locally. Any SH
|
||||
* and DF requests should be able to share the lock without demoting.
|
||||
*
|
||||
* Note 3: We only want to demote the demoteable holders when there
|
||||
* are no more strong holders. The demoteable holders might as well
|
||||
* keep the glock until the last strong holder is done with it.
|
||||
*/
|
||||
if (!find_first_strong_holder(gl)) {
|
||||
if (state == LM_ST_UNLOCKED)
|
||||
mock_gh.gh_state = LM_ST_EXCLUSIVE;
|
||||
demote_incompat_holders(gl, &mock_gh);
|
||||
}
|
||||
handle_callback(gl, state, delay, true);
|
||||
__gfs2_glock_queue_work(gl, delay);
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
@@ -2078,6 +2238,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
|
||||
*p++ = 'H';
|
||||
if (test_bit(HIF_WAIT, &iflags))
|
||||
*p++ = 'W';
|
||||
if (test_bit(HIF_MAY_DEMOTE, &iflags))
|
||||
*p++ = 'D';
|
||||
*p = 0;
|
||||
return buf;
|
||||
}
|
||||
|
||||
@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
|
||||
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
|
||||
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
|
||||
break;
|
||||
if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
|
||||
continue;
|
||||
if (gh->gh_owner_pid == pid)
|
||||
goto out;
|
||||
}
|
||||
@@ -325,6 +327,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
}
|
||||
|
||||
static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
|
||||
{
|
||||
struct gfs2_glock *gl = gh->gh_gl;
|
||||
|
||||
spin_lock(&gl->gl_lockref.lock);
|
||||
set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
}
|
||||
|
||||
static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
|
||||
{
|
||||
struct gfs2_glock *gl = gh->gh_gl;
|
||||
|
||||
spin_lock(&gl->gl_lockref.lock);
|
||||
clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
|
||||
spin_unlock(&gl->gl_lockref.lock);
|
||||
}
|
||||
|
||||
extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
|
||||
extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
|
||||
|
||||
|
||||
@@ -252,6 +252,7 @@ struct gfs2_lkstats {
|
||||
|
||||
enum {
|
||||
/* States */
|
||||
HIF_MAY_DEMOTE = 1,
|
||||
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
|
||||
HIF_WAIT = 10,
|
||||
};
|
||||
@@ -386,9 +387,8 @@ struct gfs2_inode {
|
||||
u64 i_generation;
|
||||
u64 i_eattr;
|
||||
unsigned long i_flags; /* GIF_... */
|
||||
struct gfs2_glock *i_gl; /* Move into i_gh? */
|
||||
struct gfs2_glock *i_gl;
|
||||
struct gfs2_holder i_iopen_gh;
|
||||
struct gfs2_holder i_gh; /* for prepare/commit_write only */
|
||||
struct gfs2_qadata *i_qadata; /* quota allocation data */
|
||||
struct gfs2_holder i_rgd_gh;
|
||||
struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
|
||||
|
||||
@@ -757,7 +757,7 @@ again:
|
||||
* same page as we're writing to, without it being marked
|
||||
* up-to-date.
|
||||
*/
|
||||
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
|
||||
if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
|
||||
status = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -381,6 +381,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
|
||||
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
|
||||
|
||||
dio->size += length;
|
||||
if (!length)
|
||||
return -EFAULT;
|
||||
return length;
|
||||
}
|
||||
|
||||
@@ -412,6 +414,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
|
||||
copied = copy_to_iter(inline_data, length, iter);
|
||||
}
|
||||
dio->size += copied;
|
||||
if (!copied)
|
||||
return -EFAULT;
|
||||
return copied;
|
||||
}
|
||||
|
||||
@@ -596,6 +600,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
|
||||
iov_iter_revert(iter, iomi.pos - dio->i_size);
|
||||
|
||||
if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
|
||||
if (!(iocb->ki_flags & IOCB_NOWAIT))
|
||||
wait_for_completion = true;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
/* magic error code to fall back to buffered I/O */
|
||||
if (ret == -ENOTBLK) {
|
||||
wait_for_completion = true;
|
||||
|
||||
@@ -1829,7 +1829,7 @@ again:
|
||||
* pages being swapped out between us bringing them into memory
|
||||
* and doing the actual copying.
|
||||
*/
|
||||
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
|
||||
if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
|
||||
status = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -989,7 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
|
||||
frame_vbo = pos & ~(frame_size - 1);
|
||||
index = frame_vbo >> PAGE_SHIFT;
|
||||
|
||||
if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
|
||||
if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
|
||||
err = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -299,6 +299,34 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
|
||||
|
||||
extern const struct bpf_map_ops bpf_map_offload_ops;
|
||||
|
||||
/* bpf_type_flag contains a set of flags that are applicable to the values of
|
||||
* arg_type, ret_type and reg_type. For example, a pointer value may be null,
|
||||
* or a memory is read-only. We classify types into two categories: base types
|
||||
* and extended types. Extended types are base types combined with a type flag.
|
||||
*
|
||||
* Currently there are no more than 32 base types in arg_type, ret_type and
|
||||
* reg_types.
|
||||
*/
|
||||
#define BPF_BASE_TYPE_BITS 8
|
||||
|
||||
enum bpf_type_flag {
|
||||
/* PTR may be NULL. */
|
||||
PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS),
|
||||
|
||||
/* MEM is read-only. When applied on bpf_arg, it indicates the arg is
|
||||
* compatible with both mutable and immutable memory.
|
||||
*/
|
||||
MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS),
|
||||
|
||||
__BPF_TYPE_LAST_FLAG = MEM_RDONLY,
|
||||
};
|
||||
|
||||
/* Max number of base types. */
|
||||
#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
|
||||
|
||||
/* Max number of all types. */
|
||||
#define BPF_TYPE_LIMIT (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))
|
||||
|
||||
/* function argument constraints */
|
||||
enum bpf_arg_type {
|
||||
ARG_DONTCARE = 0, /* unused argument in helper function */
|
||||
@@ -310,13 +338,11 @@ enum bpf_arg_type {
|
||||
ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */
|
||||
ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */
|
||||
ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */
|
||||
ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */
|
||||
|
||||
/* the following constraints used to prototype bpf_memcmp() and other
|
||||
* functions that access data on eBPF program stack
|
||||
*/
|
||||
ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */
|
||||
ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
|
||||
ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized,
|
||||
* helper function must fill all bytes or clear
|
||||
* them in error case.
|
||||
@@ -326,42 +352,65 @@ enum bpf_arg_type {
|
||||
ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
|
||||
|
||||
ARG_PTR_TO_CTX, /* pointer to context */
|
||||
ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */
|
||||
ARG_ANYTHING, /* any (initialized) argument is ok */
|
||||
ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
|
||||
ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
|
||||
ARG_PTR_TO_INT, /* pointer to int */
|
||||
ARG_PTR_TO_LONG, /* pointer to long */
|
||||
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
|
||||
ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */
|
||||
ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
|
||||
ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */
|
||||
ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */
|
||||
ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */
|
||||
ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
|
||||
ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */
|
||||
ARG_PTR_TO_FUNC, /* pointer to a bpf program function */
|
||||
ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */
|
||||
ARG_PTR_TO_STACK, /* pointer to stack */
|
||||
ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
|
||||
ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
|
||||
__BPF_ARG_TYPE_MAX,
|
||||
|
||||
/* Extended arg_types. */
|
||||
ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
|
||||
ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
|
||||
ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
|
||||
ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
|
||||
ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM,
|
||||
ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
|
||||
|
||||
/* This must be the last entry. Its purpose is to ensure the enum is
|
||||
* wide enough to hold the higher bits reserved for bpf_type_flag.
|
||||
*/
|
||||
__BPF_ARG_TYPE_LIMIT = BPF_TYPE_LIMIT,
|
||||
};
|
||||
static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
|
||||
|
||||
/* type of values returned from helper functions */
|
||||
enum bpf_return_type {
|
||||
RET_INTEGER, /* function returns integer */
|
||||
RET_VOID, /* function doesn't return anything */
|
||||
RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */
|
||||
RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
|
||||
RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */
|
||||
RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */
|
||||
RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
|
||||
RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */
|
||||
RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */
|
||||
RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
|
||||
RET_PTR_TO_SOCKET, /* returns a pointer to a socket */
|
||||
RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */
|
||||
RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */
|
||||
RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */
|
||||
RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */
|
||||
RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */
|
||||
__BPF_RET_TYPE_MAX,
|
||||
|
||||
/* Extended ret_types. */
|
||||
RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE,
|
||||
RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
|
||||
RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
|
||||
RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
|
||||
RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM,
|
||||
RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
|
||||
|
||||
/* This must be the last entry. Its purpose is to ensure the enum is
|
||||
* wide enough to hold the higher bits reserved for bpf_type_flag.
|
||||
*/
|
||||
__BPF_RET_TYPE_LIMIT = BPF_TYPE_LIMIT,
|
||||
};
|
||||
static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
|
||||
|
||||
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
|
||||
* to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
|
||||
@@ -423,18 +472,15 @@ enum bpf_reg_type {
|
||||
PTR_TO_CTX, /* reg points to bpf_context */
|
||||
CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
|
||||
PTR_TO_MAP_VALUE, /* reg points to map element value */
|
||||
PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
|
||||
PTR_TO_MAP_KEY, /* reg points to a map element key */
|
||||
PTR_TO_STACK, /* reg == frame_pointer + offset */
|
||||
PTR_TO_PACKET_META, /* skb->data - meta_len */
|
||||
PTR_TO_PACKET, /* reg points to skb->data */
|
||||
PTR_TO_PACKET_END, /* skb->data + headlen */
|
||||
PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
|
||||
PTR_TO_SOCKET, /* reg points to struct bpf_sock */
|
||||
PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
|
||||
PTR_TO_SOCK_COMMON, /* reg points to sock_common */
|
||||
PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
|
||||
PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
|
||||
PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
|
||||
PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
|
||||
PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */
|
||||
/* PTR_TO_BTF_ID points to a kernel struct that does not need
|
||||
@@ -452,18 +498,25 @@ enum bpf_reg_type {
|
||||
* been checked for null. Used primarily to inform the verifier
|
||||
* an explicit null check is required for this struct.
|
||||
*/
|
||||
PTR_TO_BTF_ID_OR_NULL,
|
||||
PTR_TO_MEM, /* reg points to valid memory region */
|
||||
PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */
|
||||
PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */
|
||||
PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
|
||||
PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */
|
||||
PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
|
||||
PTR_TO_BUF, /* reg points to a read/write buffer */
|
||||
PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */
|
||||
PTR_TO_FUNC, /* reg points to a bpf program function */
|
||||
PTR_TO_MAP_KEY, /* reg points to a map element key */
|
||||
__BPF_REG_TYPE_MAX,
|
||||
|
||||
/* Extended reg_types. */
|
||||
PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
|
||||
PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET,
|
||||
PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
|
||||
PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
|
||||
PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID,
|
||||
|
||||
/* This must be the last entry. Its purpose is to ensure the enum is
|
||||
* wide enough to hold the higher bits reserved for bpf_type_flag.
|
||||
*/
|
||||
__BPF_REG_TYPE_LIMIT = BPF_TYPE_LIMIT,
|
||||
};
|
||||
static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
|
||||
|
||||
/* The information passed from prog-specific *_is_valid_access
|
||||
* back to the verifier.
|
||||
|
||||
@@ -19,6 +19,8 @@
|
||||
* that converting umax_value to int cannot overflow.
|
||||
*/
|
||||
#define BPF_MAX_VAR_SIZ (1 << 29)
|
||||
/* size of type_str_buf in bpf_verifier. */
|
||||
#define TYPE_STR_BUF_LEN 64
|
||||
|
||||
/* Liveness marks, used for registers and spilled-regs (in stack slots).
|
||||
* Read marks propagate upwards until they find a write mark; they record that
|
||||
@@ -478,6 +480,9 @@ struct bpf_verifier_env {
|
||||
u32 longest_mark_read_walk;
|
||||
bpfptr_t fd_array;
|
||||
|
||||
/* buffer used in reg_type_str() to generate reg_type string */
|
||||
char type_str_buf[TYPE_STR_BUF_LEN];
|
||||
|
||||
ANDROID_KABI_RESERVE(1);
|
||||
ANDROID_KABI_RESERVE(2);
|
||||
};
|
||||
@@ -541,4 +546,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
|
||||
u32 btf_id,
|
||||
struct bpf_attach_target_info *tgt_info);
|
||||
|
||||
#define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0)
|
||||
|
||||
/* extract base type from bpf_{arg, return, reg}_type. */
|
||||
static inline u32 base_type(u32 type)
|
||||
{
|
||||
return type & BPF_BASE_TYPE_MASK;
|
||||
}
|
||||
|
||||
/* extract flags from an extended type. See bpf_type_flag in bpf.h. */
|
||||
static inline u32 type_flag(u32 type)
|
||||
{
|
||||
return type & ~BPF_BASE_TYPE_MASK;
|
||||
}
|
||||
|
||||
#endif /* _LINUX_BPF_VERIFIER_H */
|
||||
|
||||
@@ -332,6 +332,13 @@ struct iomap_dio_ops {
|
||||
*/
|
||||
#define IOMAP_DIO_OVERWRITE_ONLY (1 << 1)
|
||||
|
||||
/*
|
||||
* When a page fault occurs, return a partial synchronous result and allow
|
||||
* the caller to retry the rest of the operation after dealing with the page
|
||||
* fault.
|
||||
*/
|
||||
#define IOMAP_DIO_PARTIAL (1 << 2)
|
||||
|
||||
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
|
||||
unsigned int dio_flags, size_t done_before);
|
||||
|
||||
@@ -2903,7 +2903,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
||||
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
|
||||
#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO
|
||||
* and return without waiting upon it */
|
||||
#define FOLL_POPULATE 0x40 /* fault in page */
|
||||
#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */
|
||||
#define FOLL_NOFAULT 0x80 /* do not fault in pages */
|
||||
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
|
||||
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
|
||||
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
|
||||
|
||||
@@ -731,61 +731,11 @@ int wait_on_page_private_2_killable(struct page *page);
|
||||
extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
|
||||
|
||||
/*
|
||||
* Fault everything in given userspace address range in.
|
||||
* Fault in userspace address range.
|
||||
*/
|
||||
static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
|
||||
{
|
||||
char __user *end = uaddr + size - 1;
|
||||
|
||||
if (unlikely(size == 0))
|
||||
return 0;
|
||||
|
||||
if (unlikely(uaddr > end))
|
||||
return -EFAULT;
|
||||
/*
|
||||
* Writing zeroes into userspace here is OK, because we know that if
|
||||
* the zero gets there, we'll be overwriting it.
|
||||
*/
|
||||
do {
|
||||
if (unlikely(__put_user(0, uaddr) != 0))
|
||||
return -EFAULT;
|
||||
uaddr += PAGE_SIZE;
|
||||
} while (uaddr <= end);
|
||||
|
||||
/* Check whether the range spilled into the next page. */
|
||||
if (((unsigned long)uaddr & PAGE_MASK) ==
|
||||
((unsigned long)end & PAGE_MASK))
|
||||
return __put_user(0, end);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
|
||||
{
|
||||
volatile char c;
|
||||
const char __user *end = uaddr + size - 1;
|
||||
|
||||
if (unlikely(size == 0))
|
||||
return 0;
|
||||
|
||||
if (unlikely(uaddr > end))
|
||||
return -EFAULT;
|
||||
|
||||
do {
|
||||
if (unlikely(__get_user(c, uaddr) != 0))
|
||||
return -EFAULT;
|
||||
uaddr += PAGE_SIZE;
|
||||
} while (uaddr <= end);
|
||||
|
||||
/* Check whether the range spilled into the next page. */
|
||||
if (((unsigned long)uaddr & PAGE_MASK) ==
|
||||
((unsigned long)end & PAGE_MASK)) {
|
||||
return __get_user(c, end);
|
||||
}
|
||||
|
||||
(void)c;
|
||||
return 0;
|
||||
}
|
||||
size_t fault_in_writeable(char __user *uaddr, size_t size);
|
||||
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
|
||||
size_t fault_in_readable(const char __user *uaddr, size_t size);
|
||||
|
||||
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
|
||||
pgoff_t index, gfp_t gfp_mask);
|
||||
|
||||
@@ -35,6 +35,7 @@ struct iov_iter_state {
|
||||
|
||||
struct iov_iter {
|
||||
u8 iter_type;
|
||||
bool nofault;
|
||||
bool data_source;
|
||||
size_t iov_offset;
|
||||
size_t count;
|
||||
@@ -133,7 +134,8 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
|
||||
size_t bytes, struct iov_iter *i);
|
||||
void iov_iter_advance(struct iov_iter *i, size_t bytes);
|
||||
void iov_iter_revert(struct iov_iter *i, size_t bytes);
|
||||
int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
|
||||
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
|
||||
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
|
||||
size_t iov_iter_single_seg_count(const struct iov_iter *i);
|
||||
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
|
||||
struct iov_iter *i);
|
||||
|
||||
@@ -4800,10 +4800,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
||||
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
|
||||
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
|
||||
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
|
||||
u32 type, flag;
|
||||
|
||||
if (ctx_arg_info->offset == off &&
|
||||
(ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
|
||||
ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
|
||||
type = base_type(ctx_arg_info->reg_type);
|
||||
flag = type_flag(ctx_arg_info->reg_type);
|
||||
if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
|
||||
(flag & PTR_MAYBE_NULL)) {
|
||||
info->reg_type = ctx_arg_info->reg_type;
|
||||
return true;
|
||||
}
|
||||
@@ -5508,9 +5510,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
|
||||
if (reg->type == PTR_TO_BTF_ID) {
|
||||
reg_btf = reg->btf;
|
||||
reg_ref_id = reg->btf_id;
|
||||
} else if (reg2btf_ids[reg->type]) {
|
||||
} else if (reg2btf_ids[base_type(reg->type)]) {
|
||||
reg_btf = btf_vmlinux;
|
||||
reg_ref_id = *reg2btf_ids[reg->type];
|
||||
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
|
||||
} else {
|
||||
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
|
||||
func_name, i,
|
||||
@@ -5717,7 +5719,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
reg->type = PTR_TO_MEM_OR_NULL;
|
||||
reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
|
||||
reg->id = ++env->id_gen;
|
||||
|
||||
continue;
|
||||
@@ -6230,7 +6232,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
|
||||
.func = bpf_btf_find_by_name_kind,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
|
||||
@@ -1753,7 +1753,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
|
||||
@@ -530,7 +530,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
|
||||
.func = bpf_strtol,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_LONG,
|
||||
@@ -558,7 +558,7 @@ const struct bpf_func_proto bpf_strtoul_proto = {
|
||||
.func = bpf_strtoul,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_LONG,
|
||||
@@ -630,7 +630,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -667,7 +667,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
|
||||
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
|
||||
.func = bpf_per_cpu_ptr,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
};
|
||||
@@ -680,7 +680,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
|
||||
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
|
||||
.func = bpf_this_cpu_ptr,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
|
||||
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
|
||||
};
|
||||
|
||||
@@ -1013,7 +1013,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
|
||||
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg3_type = ARG_PTR_TO_CONST_STR,
|
||||
.arg4_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
|
||||
@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
|
||||
.ctx_arg_info_size = 2,
|
||||
.ctx_arg_info = {
|
||||
{ offsetof(struct bpf_iter__bpf_map_elem, key),
|
||||
PTR_TO_RDONLY_BUF_OR_NULL },
|
||||
PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
|
||||
{ offsetof(struct bpf_iter__bpf_map_elem, value),
|
||||
PTR_TO_RDWR_BUF_OR_NULL },
|
||||
PTR_TO_BUF | PTR_MAYBE_NULL },
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
|
||||
.func = bpf_ringbuf_output,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_CONST_MAP_PTR,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
@@ -4757,7 +4757,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_ANYTHING,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
|
||||
@@ -445,18 +445,6 @@ static bool reg_type_not_null(enum bpf_reg_type type)
|
||||
type == PTR_TO_SOCK_COMMON;
|
||||
}
|
||||
|
||||
static bool reg_type_may_be_null(enum bpf_reg_type type)
|
||||
{
|
||||
return type == PTR_TO_MAP_VALUE_OR_NULL ||
|
||||
type == PTR_TO_SOCKET_OR_NULL ||
|
||||
type == PTR_TO_SOCK_COMMON_OR_NULL ||
|
||||
type == PTR_TO_TCP_SOCK_OR_NULL ||
|
||||
type == PTR_TO_BTF_ID_OR_NULL ||
|
||||
type == PTR_TO_MEM_OR_NULL ||
|
||||
type == PTR_TO_RDONLY_BUF_OR_NULL ||
|
||||
type == PTR_TO_RDWR_BUF_OR_NULL;
|
||||
}
|
||||
|
||||
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
|
||||
{
|
||||
return reg->type == PTR_TO_MAP_VALUE &&
|
||||
@@ -465,12 +453,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
|
||||
|
||||
static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
|
||||
{
|
||||
return type == PTR_TO_SOCKET ||
|
||||
type == PTR_TO_SOCKET_OR_NULL ||
|
||||
type == PTR_TO_TCP_SOCK ||
|
||||
type == PTR_TO_TCP_SOCK_OR_NULL ||
|
||||
type == PTR_TO_MEM ||
|
||||
type == PTR_TO_MEM_OR_NULL;
|
||||
return base_type(type) == PTR_TO_SOCKET ||
|
||||
base_type(type) == PTR_TO_TCP_SOCK ||
|
||||
base_type(type) == PTR_TO_MEM;
|
||||
}
|
||||
|
||||
static bool type_is_rdonly_mem(u32 type)
|
||||
{
|
||||
return type & MEM_RDONLY;
|
||||
}
|
||||
|
||||
static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
|
||||
@@ -478,14 +468,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
|
||||
return type == ARG_PTR_TO_SOCK_COMMON;
|
||||
}
|
||||
|
||||
static bool arg_type_may_be_null(enum bpf_arg_type type)
|
||||
static bool type_may_be_null(u32 type)
|
||||
{
|
||||
return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
|
||||
type == ARG_PTR_TO_MEM_OR_NULL ||
|
||||
type == ARG_PTR_TO_CTX_OR_NULL ||
|
||||
type == ARG_PTR_TO_SOCKET_OR_NULL ||
|
||||
type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
|
||||
type == ARG_PTR_TO_STACK_OR_NULL;
|
||||
return type & PTR_MAYBE_NULL;
|
||||
}
|
||||
|
||||
/* Determine whether the function releases some resources allocated by another
|
||||
@@ -545,39 +530,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
|
||||
insn->imm == BPF_CMPXCHG;
|
||||
}
|
||||
|
||||
/* string representation of 'enum bpf_reg_type' */
|
||||
static const char * const reg_type_str[] = {
|
||||
[NOT_INIT] = "?",
|
||||
[SCALAR_VALUE] = "inv",
|
||||
[PTR_TO_CTX] = "ctx",
|
||||
[CONST_PTR_TO_MAP] = "map_ptr",
|
||||
[PTR_TO_MAP_VALUE] = "map_value",
|
||||
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
|
||||
[PTR_TO_STACK] = "fp",
|
||||
[PTR_TO_PACKET] = "pkt",
|
||||
[PTR_TO_PACKET_META] = "pkt_meta",
|
||||
[PTR_TO_PACKET_END] = "pkt_end",
|
||||
[PTR_TO_FLOW_KEYS] = "flow_keys",
|
||||
[PTR_TO_SOCKET] = "sock",
|
||||
[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
|
||||
[PTR_TO_SOCK_COMMON] = "sock_common",
|
||||
[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
|
||||
[PTR_TO_TCP_SOCK] = "tcp_sock",
|
||||
[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
|
||||
[PTR_TO_TP_BUFFER] = "tp_buffer",
|
||||
[PTR_TO_XDP_SOCK] = "xdp_sock",
|
||||
[PTR_TO_BTF_ID] = "ptr_",
|
||||
[PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
|
||||
[PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
|
||||
[PTR_TO_MEM] = "mem",
|
||||
[PTR_TO_MEM_OR_NULL] = "mem_or_null",
|
||||
[PTR_TO_RDONLY_BUF] = "rdonly_buf",
|
||||
[PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
|
||||
[PTR_TO_RDWR_BUF] = "rdwr_buf",
|
||||
[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
|
||||
[PTR_TO_FUNC] = "func",
|
||||
[PTR_TO_MAP_KEY] = "map_key",
|
||||
};
|
||||
/* string representation of 'enum bpf_reg_type'
|
||||
*
|
||||
* Note that reg_type_str() can not appear more than once in a single verbose()
|
||||
* statement.
|
||||
*/
|
||||
static const char *reg_type_str(struct bpf_verifier_env *env,
|
||||
enum bpf_reg_type type)
|
||||
{
|
||||
char postfix[16] = {0}, prefix[16] = {0};
|
||||
static const char * const str[] = {
|
||||
[NOT_INIT] = "?",
|
||||
[SCALAR_VALUE] = "inv",
|
||||
[PTR_TO_CTX] = "ctx",
|
||||
[CONST_PTR_TO_MAP] = "map_ptr",
|
||||
[PTR_TO_MAP_VALUE] = "map_value",
|
||||
[PTR_TO_STACK] = "fp",
|
||||
[PTR_TO_PACKET] = "pkt",
|
||||
[PTR_TO_PACKET_META] = "pkt_meta",
|
||||
[PTR_TO_PACKET_END] = "pkt_end",
|
||||
[PTR_TO_FLOW_KEYS] = "flow_keys",
|
||||
[PTR_TO_SOCKET] = "sock",
|
||||
[PTR_TO_SOCK_COMMON] = "sock_common",
|
||||
[PTR_TO_TCP_SOCK] = "tcp_sock",
|
||||
[PTR_TO_TP_BUFFER] = "tp_buffer",
|
||||
[PTR_TO_XDP_SOCK] = "xdp_sock",
|
||||
[PTR_TO_BTF_ID] = "ptr_",
|
||||
[PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
|
||||
[PTR_TO_MEM] = "mem",
|
||||
[PTR_TO_BUF] = "buf",
|
||||
[PTR_TO_FUNC] = "func",
|
||||
[PTR_TO_MAP_KEY] = "map_key",
|
||||
};
|
||||
|
||||
if (type & PTR_MAYBE_NULL) {
|
||||
if (base_type(type) == PTR_TO_BTF_ID ||
|
||||
base_type(type) == PTR_TO_PERCPU_BTF_ID)
|
||||
strncpy(postfix, "or_null_", 16);
|
||||
else
|
||||
strncpy(postfix, "_or_null", 16);
|
||||
}
|
||||
|
||||
if (type & MEM_RDONLY)
|
||||
strncpy(prefix, "rdonly_", 16);
|
||||
|
||||
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
|
||||
prefix, str[base_type(type)], postfix);
|
||||
return env->type_str_buf;
|
||||
}
|
||||
|
||||
static char slot_type_char[] = {
|
||||
[STACK_INVALID] = '?',
|
||||
@@ -628,7 +628,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
|
||||
continue;
|
||||
verbose(env, " R%d", i);
|
||||
print_liveness(env, reg->live);
|
||||
verbose(env, "=%s", reg_type_str[t]);
|
||||
verbose(env, "=%s", reg_type_str(env, t));
|
||||
if (t == SCALAR_VALUE && reg->precise)
|
||||
verbose(env, "P");
|
||||
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
|
||||
@@ -636,9 +636,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
|
||||
/* reg->off should be 0 for SCALAR_VALUE */
|
||||
verbose(env, "%lld", reg->var_off.value + reg->off);
|
||||
} else {
|
||||
if (t == PTR_TO_BTF_ID ||
|
||||
t == PTR_TO_BTF_ID_OR_NULL ||
|
||||
t == PTR_TO_PERCPU_BTF_ID)
|
||||
if (base_type(t) == PTR_TO_BTF_ID ||
|
||||
base_type(t) == PTR_TO_PERCPU_BTF_ID)
|
||||
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
|
||||
verbose(env, "(id=%d", reg->id);
|
||||
if (reg_type_may_be_refcounted_or_null(t))
|
||||
@@ -647,10 +646,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
|
||||
verbose(env, ",off=%d", reg->off);
|
||||
if (type_is_pkt_pointer(t))
|
||||
verbose(env, ",r=%d", reg->range);
|
||||
else if (t == CONST_PTR_TO_MAP ||
|
||||
t == PTR_TO_MAP_KEY ||
|
||||
t == PTR_TO_MAP_VALUE ||
|
||||
t == PTR_TO_MAP_VALUE_OR_NULL)
|
||||
else if (base_type(t) == CONST_PTR_TO_MAP ||
|
||||
base_type(t) == PTR_TO_MAP_KEY ||
|
||||
base_type(t) == PTR_TO_MAP_VALUE)
|
||||
verbose(env, ",ks=%d,vs=%d",
|
||||
reg->map_ptr->key_size,
|
||||
reg->map_ptr->value_size);
|
||||
@@ -720,7 +718,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
|
||||
if (state->stack[i].slot_type[0] == STACK_SPILL) {
|
||||
reg = &state->stack[i].spilled_ptr;
|
||||
t = reg->type;
|
||||
verbose(env, "=%s", reg_type_str[t]);
|
||||
verbose(env, "=%s", reg_type_str(env, t));
|
||||
if (t == SCALAR_VALUE && reg->precise)
|
||||
verbose(env, "P");
|
||||
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
|
||||
@@ -1133,8 +1131,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
|
||||
|
||||
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
|
||||
{
|
||||
switch (reg->type) {
|
||||
case PTR_TO_MAP_VALUE_OR_NULL: {
|
||||
if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
|
||||
const struct bpf_map *map = reg->map_ptr;
|
||||
|
||||
if (map->inner_map_meta) {
|
||||
@@ -1153,32 +1150,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
|
||||
} else {
|
||||
reg->type = PTR_TO_MAP_VALUE;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PTR_TO_SOCKET_OR_NULL:
|
||||
reg->type = PTR_TO_SOCKET;
|
||||
break;
|
||||
case PTR_TO_SOCK_COMMON_OR_NULL:
|
||||
reg->type = PTR_TO_SOCK_COMMON;
|
||||
break;
|
||||
case PTR_TO_TCP_SOCK_OR_NULL:
|
||||
reg->type = PTR_TO_TCP_SOCK;
|
||||
break;
|
||||
case PTR_TO_BTF_ID_OR_NULL:
|
||||
reg->type = PTR_TO_BTF_ID;
|
||||
break;
|
||||
case PTR_TO_MEM_OR_NULL:
|
||||
reg->type = PTR_TO_MEM;
|
||||
break;
|
||||
case PTR_TO_RDONLY_BUF_OR_NULL:
|
||||
reg->type = PTR_TO_RDONLY_BUF;
|
||||
break;
|
||||
case PTR_TO_RDWR_BUF_OR_NULL:
|
||||
reg->type = PTR_TO_RDWR_BUF;
|
||||
break;
|
||||
default:
|
||||
WARN_ONCE(1, "unknown nullable register type");
|
||||
return;
|
||||
}
|
||||
|
||||
reg->type &= ~PTR_MAYBE_NULL;
|
||||
}
|
||||
|
||||
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
|
||||
@@ -1906,7 +1881,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
|
||||
break;
|
||||
if (parent->live & REG_LIVE_DONE) {
|
||||
verbose(env, "verifier BUG type %s var_off %lld off %d\n",
|
||||
reg_type_str[parent->type],
|
||||
reg_type_str(env, parent->type),
|
||||
parent->var_off.value, parent->off);
|
||||
return -EFAULT;
|
||||
}
|
||||
@@ -2564,9 +2539,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
|
||||
|
||||
static bool is_spillable_regtype(enum bpf_reg_type type)
|
||||
{
|
||||
switch (type) {
|
||||
switch (base_type(type)) {
|
||||
case PTR_TO_MAP_VALUE:
|
||||
case PTR_TO_MAP_VALUE_OR_NULL:
|
||||
case PTR_TO_STACK:
|
||||
case PTR_TO_CTX:
|
||||
case PTR_TO_PACKET:
|
||||
@@ -2575,21 +2549,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
|
||||
case PTR_TO_FLOW_KEYS:
|
||||
case CONST_PTR_TO_MAP:
|
||||
case PTR_TO_SOCKET:
|
||||
case PTR_TO_SOCKET_OR_NULL:
|
||||
case PTR_TO_SOCK_COMMON:
|
||||
case PTR_TO_SOCK_COMMON_OR_NULL:
|
||||
case PTR_TO_TCP_SOCK:
|
||||
case PTR_TO_TCP_SOCK_OR_NULL:
|
||||
case PTR_TO_XDP_SOCK:
|
||||
case PTR_TO_BTF_ID:
|
||||
case PTR_TO_BTF_ID_OR_NULL:
|
||||
case PTR_TO_RDONLY_BUF:
|
||||
case PTR_TO_RDONLY_BUF_OR_NULL:
|
||||
case PTR_TO_RDWR_BUF:
|
||||
case PTR_TO_RDWR_BUF_OR_NULL:
|
||||
case PTR_TO_BUF:
|
||||
case PTR_TO_PERCPU_BTF_ID:
|
||||
case PTR_TO_MEM:
|
||||
case PTR_TO_MEM_OR_NULL:
|
||||
case PTR_TO_FUNC:
|
||||
case PTR_TO_MAP_KEY:
|
||||
return true;
|
||||
@@ -3405,7 +3371,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
|
||||
*/
|
||||
*reg_type = info.reg_type;
|
||||
|
||||
if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
|
||||
if (base_type(*reg_type) == PTR_TO_BTF_ID) {
|
||||
*btf = info.btf;
|
||||
*btf_id = info.btf_id;
|
||||
} else {
|
||||
@@ -3473,7 +3439,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
|
||||
}
|
||||
|
||||
verbose(env, "R%d invalid %s access off=%d size=%d\n",
|
||||
regno, reg_type_str[reg->type], off, size);
|
||||
regno, reg_type_str(env, reg->type), off, size);
|
||||
|
||||
return -EACCES;
|
||||
}
|
||||
@@ -4200,15 +4166,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
mark_reg_unknown(env, regs, value_regno);
|
||||
}
|
||||
}
|
||||
} else if (reg->type == PTR_TO_MEM) {
|
||||
} else if (base_type(reg->type) == PTR_TO_MEM) {
|
||||
bool rdonly_mem = type_is_rdonly_mem(reg->type);
|
||||
|
||||
if (type_may_be_null(reg->type)) {
|
||||
verbose(env, "R%d invalid mem access '%s'\n", regno,
|
||||
reg_type_str(env, reg->type));
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
if (t == BPF_WRITE && rdonly_mem) {
|
||||
verbose(env, "R%d cannot write into %s\n",
|
||||
regno, reg_type_str(env, reg->type));
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
if (t == BPF_WRITE && value_regno >= 0 &&
|
||||
is_pointer_value(env, value_regno)) {
|
||||
verbose(env, "R%d leaks addr into mem\n", value_regno);
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
err = check_mem_region_access(env, regno, off, size,
|
||||
reg->mem_size, false);
|
||||
if (!err && t == BPF_READ && value_regno >= 0)
|
||||
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
|
||||
mark_reg_unknown(env, regs, value_regno);
|
||||
} else if (reg->type == PTR_TO_CTX) {
|
||||
enum bpf_reg_type reg_type = SCALAR_VALUE;
|
||||
@@ -4238,7 +4219,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
} else {
|
||||
mark_reg_known_zero(env, regs,
|
||||
value_regno);
|
||||
if (reg_type_may_be_null(reg_type))
|
||||
if (type_may_be_null(reg_type))
|
||||
regs[value_regno].id = ++env->id_gen;
|
||||
/* A load of ctx field could have different
|
||||
* actual load size with the one encoded in the
|
||||
@@ -4246,8 +4227,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
* a sub-register.
|
||||
*/
|
||||
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
|
||||
if (reg_type == PTR_TO_BTF_ID ||
|
||||
reg_type == PTR_TO_BTF_ID_OR_NULL) {
|
||||
if (base_type(reg_type) == PTR_TO_BTF_ID) {
|
||||
regs[value_regno].btf = btf;
|
||||
regs[value_regno].btf_id = btf_id;
|
||||
}
|
||||
@@ -4300,7 +4280,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
} else if (type_is_sk_pointer(reg->type)) {
|
||||
if (t == BPF_WRITE) {
|
||||
verbose(env, "R%d cannot write into %s\n",
|
||||
regno, reg_type_str[reg->type]);
|
||||
regno, reg_type_str(env, reg->type));
|
||||
return -EACCES;
|
||||
}
|
||||
err = check_sock_access(env, insn_idx, regno, off, size, t);
|
||||
@@ -4316,26 +4296,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
|
||||
} else if (reg->type == CONST_PTR_TO_MAP) {
|
||||
err = check_ptr_to_map_access(env, regs, regno, off, size, t,
|
||||
value_regno);
|
||||
} else if (reg->type == PTR_TO_RDONLY_BUF) {
|
||||
if (t == BPF_WRITE) {
|
||||
verbose(env, "R%d cannot write into %s\n",
|
||||
regno, reg_type_str[reg->type]);
|
||||
return -EACCES;
|
||||
} else if (base_type(reg->type) == PTR_TO_BUF) {
|
||||
bool rdonly_mem = type_is_rdonly_mem(reg->type);
|
||||
const char *buf_info;
|
||||
u32 *max_access;
|
||||
|
||||
if (rdonly_mem) {
|
||||
if (t == BPF_WRITE) {
|
||||
verbose(env, "R%d cannot write into %s\n",
|
||||
regno, reg_type_str(env, reg->type));
|
||||
return -EACCES;
|
||||
}
|
||||
buf_info = "rdonly";
|
||||
max_access = &env->prog->aux->max_rdonly_access;
|
||||
} else {
|
||||
buf_info = "rdwr";
|
||||
max_access = &env->prog->aux->max_rdwr_access;
|
||||
}
|
||||
|
||||
err = check_buffer_access(env, reg, regno, off, size, false,
|
||||
"rdonly",
|
||||
&env->prog->aux->max_rdonly_access);
|
||||
if (!err && value_regno >= 0)
|
||||
mark_reg_unknown(env, regs, value_regno);
|
||||
} else if (reg->type == PTR_TO_RDWR_BUF) {
|
||||
err = check_buffer_access(env, reg, regno, off, size, false,
|
||||
"rdwr",
|
||||
&env->prog->aux->max_rdwr_access);
|
||||
if (!err && t == BPF_READ && value_regno >= 0)
|
||||
buf_info, max_access);
|
||||
|
||||
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
|
||||
mark_reg_unknown(env, regs, value_regno);
|
||||
} else {
|
||||
verbose(env, "R%d invalid mem access '%s'\n", regno,
|
||||
reg_type_str[reg->type]);
|
||||
reg_type_str(env, reg->type));
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
@@ -4409,7 +4395,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
|
||||
is_sk_reg(env, insn->dst_reg)) {
|
||||
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
|
||||
insn->dst_reg,
|
||||
reg_type_str[reg_state(env, insn->dst_reg)->type]);
|
||||
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
@@ -4592,8 +4578,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
|
||||
struct bpf_call_arg_meta *meta)
|
||||
{
|
||||
struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno];
|
||||
const char *buf_info;
|
||||
u32 *max_access;
|
||||
|
||||
switch (reg->type) {
|
||||
switch (base_type(reg->type)) {
|
||||
case PTR_TO_PACKET:
|
||||
case PTR_TO_PACKET_META:
|
||||
return check_packet_access(env, regno, reg->off, access_size,
|
||||
@@ -4612,18 +4600,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
|
||||
return check_mem_region_access(env, regno, reg->off,
|
||||
access_size, reg->mem_size,
|
||||
zero_size_allowed);
|
||||
case PTR_TO_RDONLY_BUF:
|
||||
if (meta && meta->raw_mode)
|
||||
return -EACCES;
|
||||
case PTR_TO_BUF:
|
||||
if (type_is_rdonly_mem(reg->type)) {
|
||||
if (meta && meta->raw_mode)
|
||||
return -EACCES;
|
||||
|
||||
buf_info = "rdonly";
|
||||
max_access = &env->prog->aux->max_rdonly_access;
|
||||
} else {
|
||||
buf_info = "rdwr";
|
||||
max_access = &env->prog->aux->max_rdwr_access;
|
||||
}
|
||||
return check_buffer_access(env, reg, regno, reg->off,
|
||||
access_size, zero_size_allowed,
|
||||
"rdonly",
|
||||
&env->prog->aux->max_rdonly_access);
|
||||
case PTR_TO_RDWR_BUF:
|
||||
return check_buffer_access(env, reg, regno, reg->off,
|
||||
access_size, zero_size_allowed,
|
||||
"rdwr",
|
||||
&env->prog->aux->max_rdwr_access);
|
||||
buf_info, max_access);
|
||||
case PTR_TO_STACK:
|
||||
return check_stack_range_initialized(
|
||||
env,
|
||||
@@ -4635,9 +4625,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
|
||||
register_is_null(reg))
|
||||
return 0;
|
||||
|
||||
verbose(env, "R%d type=%s expected=%s\n", regno,
|
||||
reg_type_str[reg->type],
|
||||
reg_type_str[PTR_TO_STACK]);
|
||||
verbose(env, "R%d type=%s ", regno,
|
||||
reg_type_str(env, reg->type));
|
||||
verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
|
||||
return -EACCES;
|
||||
}
|
||||
}
|
||||
@@ -4648,7 +4638,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
|
||||
if (register_is_null(reg))
|
||||
return 0;
|
||||
|
||||
if (reg_type_may_be_null(reg->type)) {
|
||||
if (type_may_be_null(reg->type)) {
|
||||
/* Assuming that the register contains a value check if the memory
|
||||
* access is safe. Temporarily save and restore the register's state as
|
||||
* the conversion shouldn't be visible to a caller.
|
||||
@@ -4796,9 +4786,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
|
||||
|
||||
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
|
||||
{
|
||||
return type == ARG_PTR_TO_MEM ||
|
||||
type == ARG_PTR_TO_MEM_OR_NULL ||
|
||||
type == ARG_PTR_TO_UNINIT_MEM;
|
||||
return base_type(type) == ARG_PTR_TO_MEM ||
|
||||
base_type(type) == ARG_PTR_TO_UNINIT_MEM;
|
||||
}
|
||||
|
||||
static bool arg_type_is_mem_size(enum bpf_arg_type type)
|
||||
@@ -4900,8 +4889,7 @@ static const struct bpf_reg_types mem_types = {
|
||||
PTR_TO_MAP_KEY,
|
||||
PTR_TO_MAP_VALUE,
|
||||
PTR_TO_MEM,
|
||||
PTR_TO_RDONLY_BUF,
|
||||
PTR_TO_RDWR_BUF,
|
||||
PTR_TO_BUF,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -4932,31 +4920,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
|
||||
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
|
||||
[ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
|
||||
[ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
|
||||
[ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
|
||||
[ARG_CONST_SIZE] = &scalar_types,
|
||||
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
|
||||
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
|
||||
[ARG_CONST_MAP_PTR] = &const_map_ptr_types,
|
||||
[ARG_PTR_TO_CTX] = &context_types,
|
||||
[ARG_PTR_TO_CTX_OR_NULL] = &context_types,
|
||||
[ARG_PTR_TO_SOCK_COMMON] = &sock_types,
|
||||
#ifdef CONFIG_NET
|
||||
[ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
|
||||
#endif
|
||||
[ARG_PTR_TO_SOCKET] = &fullsock_types,
|
||||
[ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
|
||||
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
|
||||
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
|
||||
[ARG_PTR_TO_MEM] = &mem_types,
|
||||
[ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
|
||||
[ARG_PTR_TO_UNINIT_MEM] = &mem_types,
|
||||
[ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
|
||||
[ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
|
||||
[ARG_PTR_TO_INT] = &int_ptr_types,
|
||||
[ARG_PTR_TO_LONG] = &int_ptr_types,
|
||||
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
|
||||
[ARG_PTR_TO_FUNC] = &func_ptr_types,
|
||||
[ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
|
||||
[ARG_PTR_TO_STACK] = &stack_ptr_types,
|
||||
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
|
||||
[ARG_PTR_TO_TIMER] = &timer_types,
|
||||
};
|
||||
@@ -4970,12 +4953,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
|
||||
const struct bpf_reg_types *compatible;
|
||||
int i, j;
|
||||
|
||||
compatible = compatible_reg_types[arg_type];
|
||||
compatible = compatible_reg_types[base_type(arg_type)];
|
||||
if (!compatible) {
|
||||
verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
/* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
|
||||
* but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
|
||||
*
|
||||
* Same for MAYBE_NULL:
|
||||
*
|
||||
* ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
|
||||
* but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
|
||||
*
|
||||
* Therefore we fold these flags depending on the arg_type before comparison.
|
||||
*/
|
||||
if (arg_type & MEM_RDONLY)
|
||||
type &= ~MEM_RDONLY;
|
||||
if (arg_type & PTR_MAYBE_NULL)
|
||||
type &= ~PTR_MAYBE_NULL;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
|
||||
expected = compatible->types[i];
|
||||
if (expected == NOT_INIT)
|
||||
@@ -4985,14 +4983,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
|
||||
goto found;
|
||||
}
|
||||
|
||||
verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
|
||||
verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
|
||||
for (j = 0; j + 1 < i; j++)
|
||||
verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
|
||||
verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
|
||||
verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
|
||||
verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
|
||||
return -EACCES;
|
||||
|
||||
found:
|
||||
if (type == PTR_TO_BTF_ID) {
|
||||
if (reg->type == PTR_TO_BTF_ID) {
|
||||
if (!arg_btf_id) {
|
||||
if (!compatible->btf_id) {
|
||||
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
|
||||
@@ -5051,15 +5049,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
if (arg_type == ARG_PTR_TO_MAP_VALUE ||
|
||||
arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
|
||||
arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
|
||||
if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
|
||||
base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
|
||||
err = resolve_map_arg_type(env, meta, &arg_type);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
if (register_is_null(reg) && arg_type_may_be_null(arg_type))
|
||||
if (register_is_null(reg) && type_may_be_null(arg_type))
|
||||
/* A NULL register has a SCALAR_VALUE type, so skip
|
||||
* type checking.
|
||||
*/
|
||||
@@ -5128,10 +5125,11 @@ skip_type_check:
|
||||
err = check_helper_mem_access(env, regno,
|
||||
meta->map_ptr->key_size, false,
|
||||
NULL);
|
||||
} else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
|
||||
(arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
|
||||
!register_is_null(reg)) ||
|
||||
arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
|
||||
} else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
|
||||
base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
|
||||
if (type_may_be_null(arg_type) && register_is_null(reg))
|
||||
return 0;
|
||||
|
||||
/* bpf_map_xxx(..., map_ptr, ..., value) call:
|
||||
* check [value, value + map->value_size) validity
|
||||
*/
|
||||
@@ -6206,6 +6204,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
|
||||
int *insn_idx_p)
|
||||
{
|
||||
const struct bpf_func_proto *fn = NULL;
|
||||
enum bpf_return_type ret_type;
|
||||
enum bpf_type_flag ret_flag;
|
||||
struct bpf_reg_state *regs;
|
||||
struct bpf_call_arg_meta meta;
|
||||
int insn_idx = *insn_idx_p;
|
||||
@@ -6339,13 +6339,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
|
||||
regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
|
||||
|
||||
/* update return register (already marked as written above) */
|
||||
if (fn->ret_type == RET_INTEGER) {
|
||||
ret_type = fn->ret_type;
|
||||
ret_flag = type_flag(fn->ret_type);
|
||||
if (ret_type == RET_INTEGER) {
|
||||
/* sets type to SCALAR_VALUE */
|
||||
mark_reg_unknown(env, regs, BPF_REG_0);
|
||||
} else if (fn->ret_type == RET_VOID) {
|
||||
} else if (ret_type == RET_VOID) {
|
||||
regs[BPF_REG_0].type = NOT_INIT;
|
||||
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
|
||||
fn->ret_type == RET_PTR_TO_MAP_VALUE) {
|
||||
} else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
|
||||
/* There is no offset yet applied, variable or fixed */
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
/* remember map_ptr, so that check_map_access()
|
||||
@@ -6359,28 +6360,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
|
||||
}
|
||||
regs[BPF_REG_0].map_ptr = meta.map_ptr;
|
||||
regs[BPF_REG_0].map_uid = meta.map_uid;
|
||||
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
|
||||
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
|
||||
if (map_value_has_spin_lock(meta.map_ptr))
|
||||
regs[BPF_REG_0].id = ++env->id_gen;
|
||||
} else {
|
||||
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
|
||||
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
|
||||
if (!type_may_be_null(ret_type) &&
|
||||
map_value_has_spin_lock(meta.map_ptr)) {
|
||||
regs[BPF_REG_0].id = ++env->id_gen;
|
||||
}
|
||||
} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
|
||||
} else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
|
||||
} else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
|
||||
regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
|
||||
} else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
|
||||
} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
|
||||
regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
|
||||
} else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
|
||||
} else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
|
||||
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
|
||||
} else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
|
||||
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
|
||||
regs[BPF_REG_0].mem_size = meta.mem_size;
|
||||
} else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
|
||||
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
|
||||
} else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
|
||||
const struct btf_type *t;
|
||||
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
@@ -6398,29 +6396,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
|
||||
tname, PTR_ERR(ret));
|
||||
return -EINVAL;
|
||||
}
|
||||
regs[BPF_REG_0].type =
|
||||
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
|
||||
PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
|
||||
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
|
||||
regs[BPF_REG_0].mem_size = tsize;
|
||||
} else {
|
||||
regs[BPF_REG_0].type =
|
||||
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
|
||||
PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
|
||||
/* MEM_RDONLY may be carried from ret_flag, but it
|
||||
* doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
|
||||
* it will confuse the check of PTR_TO_BTF_ID in
|
||||
* check_mem_access().
|
||||
*/
|
||||
ret_flag &= ~MEM_RDONLY;
|
||||
|
||||
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
|
||||
regs[BPF_REG_0].btf = meta.ret_btf;
|
||||
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
|
||||
}
|
||||
} else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
|
||||
fn->ret_type == RET_PTR_TO_BTF_ID) {
|
||||
} else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
|
||||
int ret_btf_id;
|
||||
|
||||
mark_reg_known_zero(env, regs, BPF_REG_0);
|
||||
regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
|
||||
PTR_TO_BTF_ID :
|
||||
PTR_TO_BTF_ID_OR_NULL;
|
||||
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
|
||||
ret_btf_id = *fn->ret_btf_id;
|
||||
if (ret_btf_id == 0) {
|
||||
verbose(env, "invalid return type %d of func %s#%d\n",
|
||||
fn->ret_type, func_id_name(func_id), func_id);
|
||||
verbose(env, "invalid return type %u of func %s#%d\n",
|
||||
base_type(ret_type), func_id_name(func_id),
|
||||
func_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
/* current BPF helper definitions are only coming from
|
||||
@@ -6429,12 +6428,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
|
||||
regs[BPF_REG_0].btf = btf_vmlinux;
|
||||
regs[BPF_REG_0].btf_id = ret_btf_id;
|
||||
} else {
|
||||
verbose(env, "unknown return type %d of func %s#%d\n",
|
||||
fn->ret_type, func_id_name(func_id), func_id);
|
||||
verbose(env, "unknown return type %u of func %s#%d\n",
|
||||
base_type(ret_type), func_id_name(func_id), func_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (reg_type_may_be_null(regs[BPF_REG_0].type))
|
||||
if (type_may_be_null(regs[BPF_REG_0].type))
|
||||
regs[BPF_REG_0].id = ++env->id_gen;
|
||||
|
||||
if (is_ptr_cast_function(func_id)) {
|
||||
@@ -6633,25 +6632,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
|
||||
|
||||
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
|
||||
verbose(env, "math between %s pointer and %lld is not allowed\n",
|
||||
reg_type_str[type], val);
|
||||
reg_type_str(env, type), val);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
|
||||
verbose(env, "%s pointer offset %d is not allowed\n",
|
||||
reg_type_str[type], reg->off);
|
||||
reg_type_str(env, type), reg->off);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (smin == S64_MIN) {
|
||||
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
|
||||
reg_type_str[type]);
|
||||
reg_type_str(env, type));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
|
||||
verbose(env, "value %lld makes %s pointer be out of bounds\n",
|
||||
smin, reg_type_str[type]);
|
||||
smin, reg_type_str(env, type));
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -7028,11 +7027,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
switch (ptr_reg->type) {
|
||||
case PTR_TO_MAP_VALUE_OR_NULL:
|
||||
if (ptr_reg->type & PTR_MAYBE_NULL) {
|
||||
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
|
||||
dst, reg_type_str[ptr_reg->type]);
|
||||
dst, reg_type_str(env, ptr_reg->type));
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
switch (base_type(ptr_reg->type)) {
|
||||
case CONST_PTR_TO_MAP:
|
||||
/* smin_val represents the known value */
|
||||
if (known && smin_val == 0 && opcode == BPF_ADD)
|
||||
@@ -7045,10 +7046,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
|
||||
case PTR_TO_XDP_SOCK:
|
||||
reject:
|
||||
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
|
||||
dst, reg_type_str[ptr_reg->type]);
|
||||
dst, reg_type_str(env, ptr_reg->type));
|
||||
return -EACCES;
|
||||
default:
|
||||
if (reg_type_may_be_null(ptr_reg->type))
|
||||
if (type_may_be_null(ptr_reg->type))
|
||||
goto reject;
|
||||
break;
|
||||
}
|
||||
@@ -8770,7 +8771,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
|
||||
struct bpf_reg_state *reg, u32 id,
|
||||
bool is_null)
|
||||
{
|
||||
if (reg_type_may_be_null(reg->type) && reg->id == id &&
|
||||
if (type_may_be_null(reg->type) && reg->id == id &&
|
||||
!WARN_ON_ONCE(!reg->id)) {
|
||||
if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
|
||||
!tnum_equals_const(reg->var_off, 0) ||
|
||||
@@ -9148,7 +9149,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
|
||||
*/
|
||||
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
|
||||
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
|
||||
reg_type_may_be_null(dst_reg->type)) {
|
||||
type_may_be_null(dst_reg->type)) {
|
||||
/* Mark all identical registers in each branch as either
|
||||
* safe or unknown depending R == 0 or R != 0 conditional.
|
||||
*/
|
||||
@@ -9207,7 +9208,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
|
||||
|
||||
if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
|
||||
dst_reg->type = aux->btf_var.reg_type;
|
||||
switch (dst_reg->type) {
|
||||
switch (base_type(dst_reg->type)) {
|
||||
case PTR_TO_MEM:
|
||||
dst_reg->mem_size = aux->btf_var.mem_size;
|
||||
break;
|
||||
@@ -9404,7 +9405,7 @@ static int check_return_code(struct bpf_verifier_env *env)
|
||||
/* enforce return zero from async callbacks like timer */
|
||||
if (reg->type != SCALAR_VALUE) {
|
||||
verbose(env, "In async callback the register R0 is not a known value (%s)\n",
|
||||
reg_type_str[reg->type]);
|
||||
reg_type_str(env, reg->type));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -9418,7 +9419,7 @@ static int check_return_code(struct bpf_verifier_env *env)
|
||||
if (is_subprog) {
|
||||
if (reg->type != SCALAR_VALUE) {
|
||||
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
|
||||
reg_type_str[reg->type]);
|
||||
reg_type_str(env, reg->type));
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
@@ -9482,7 +9483,7 @@ static int check_return_code(struct bpf_verifier_env *env)
|
||||
|
||||
if (reg->type != SCALAR_VALUE) {
|
||||
verbose(env, "At program exit the register R0 is not a known value (%s)\n",
|
||||
reg_type_str[reg->type]);
|
||||
reg_type_str(env, reg->type));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@@ -10263,7 +10264,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
|
||||
return true;
|
||||
if (rcur->type == NOT_INIT)
|
||||
return false;
|
||||
switch (rold->type) {
|
||||
switch (base_type(rold->type)) {
|
||||
case SCALAR_VALUE:
|
||||
if (env->explore_alu_limits)
|
||||
return false;
|
||||
@@ -10285,6 +10286,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
|
||||
}
|
||||
case PTR_TO_MAP_KEY:
|
||||
case PTR_TO_MAP_VALUE:
|
||||
/* a PTR_TO_MAP_VALUE could be safe to use as a
|
||||
* PTR_TO_MAP_VALUE_OR_NULL into the same map.
|
||||
* However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
|
||||
* checked, doing so could have affected others with the same
|
||||
* id, and we can't check for that because we lost the id when
|
||||
* we converted to a PTR_TO_MAP_VALUE.
|
||||
*/
|
||||
if (type_may_be_null(rold->type)) {
|
||||
if (!type_may_be_null(rcur->type))
|
||||
return false;
|
||||
if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
|
||||
return false;
|
||||
/* Check our ids match any regs they're supposed to */
|
||||
return check_ids(rold->id, rcur->id, idmap);
|
||||
}
|
||||
|
||||
/* If the new min/max/var_off satisfy the old ones and
|
||||
* everything else matches, we are OK.
|
||||
* 'id' is not compared, since it's only used for maps with
|
||||
@@ -10296,20 +10313,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
|
||||
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
|
||||
range_within(rold, rcur) &&
|
||||
tnum_in(rold->var_off, rcur->var_off);
|
||||
case PTR_TO_MAP_VALUE_OR_NULL:
|
||||
/* a PTR_TO_MAP_VALUE could be safe to use as a
|
||||
* PTR_TO_MAP_VALUE_OR_NULL into the same map.
|
||||
* However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
|
||||
* checked, doing so could have affected others with the same
|
||||
* id, and we can't check for that because we lost the id when
|
||||
* we converted to a PTR_TO_MAP_VALUE.
|
||||
*/
|
||||
if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
|
||||
return false;
|
||||
if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
|
||||
return false;
|
||||
/* Check our ids match any regs they're supposed to */
|
||||
return check_ids(rold->id, rcur->id, idmap);
|
||||
case PTR_TO_PACKET_META:
|
||||
case PTR_TO_PACKET:
|
||||
if (rcur->type != rold->type)
|
||||
@@ -10338,11 +10341,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
|
||||
case PTR_TO_PACKET_END:
|
||||
case PTR_TO_FLOW_KEYS:
|
||||
case PTR_TO_SOCKET:
|
||||
case PTR_TO_SOCKET_OR_NULL:
|
||||
case PTR_TO_SOCK_COMMON:
|
||||
case PTR_TO_SOCK_COMMON_OR_NULL:
|
||||
case PTR_TO_TCP_SOCK:
|
||||
case PTR_TO_TCP_SOCK_OR_NULL:
|
||||
case PTR_TO_XDP_SOCK:
|
||||
/* Only valid matches are exact, which memcmp() above
|
||||
* would have accepted
|
||||
@@ -10868,17 +10868,13 @@ next:
|
||||
/* Return true if it's OK to have the same insn return a different type. */
|
||||
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
|
||||
{
|
||||
switch (type) {
|
||||
switch (base_type(type)) {
|
||||
case PTR_TO_CTX:
|
||||
case PTR_TO_SOCKET:
|
||||
case PTR_TO_SOCKET_OR_NULL:
|
||||
case PTR_TO_SOCK_COMMON:
|
||||
case PTR_TO_SOCK_COMMON_OR_NULL:
|
||||
case PTR_TO_TCP_SOCK:
|
||||
case PTR_TO_TCP_SOCK_OR_NULL:
|
||||
case PTR_TO_XDP_SOCK:
|
||||
case PTR_TO_BTF_ID:
|
||||
case PTR_TO_BTF_ID_OR_NULL:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
@@ -11102,7 +11098,7 @@ static int do_check(struct bpf_verifier_env *env)
|
||||
if (is_ctx_reg(env, insn->dst_reg)) {
|
||||
verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
|
||||
insn->dst_reg,
|
||||
reg_type_str[reg_state(env, insn->dst_reg)->type]);
|
||||
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
@@ -11353,7 +11349,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
|
||||
err = -EINVAL;
|
||||
goto err_put;
|
||||
}
|
||||
aux->btf_var.reg_type = PTR_TO_MEM;
|
||||
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
|
||||
aux->btf_var.mem_size = tsize;
|
||||
} else {
|
||||
aux->btf_var.reg_type = PTR_TO_BTF_ID;
|
||||
@@ -13175,7 +13171,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
|
||||
mark_reg_known_zero(env, regs, i);
|
||||
else if (regs[i].type == SCALAR_VALUE)
|
||||
mark_reg_unknown(env, regs, i);
|
||||
else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
|
||||
else if (base_type(regs[i].type) == PTR_TO_MEM) {
|
||||
const u32 mem_size = regs[i].mem_size;
|
||||
|
||||
mark_reg_known_zero(env, regs, i);
|
||||
|
||||
@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
|
||||
.gpl_only = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_ANYTHING,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
|
||||
.func = bpf_trace_printk,
|
||||
.gpl_only = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -446,9 +446,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID,
|
||||
.arg1_btf_id = &btf_seq_file_ids[0],
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -463,7 +463,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID,
|
||||
.arg1_btf_id = &btf_seq_file_ids[0],
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -487,7 +487,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID,
|
||||
.arg1_btf_id = &btf_seq_file_ids[0],
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
@@ -648,7 +648,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -958,7 +958,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
.arg3_type = ARG_PTR_TO_MEM,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
};
|
||||
@@ -1207,7 +1207,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -1429,7 +1429,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
|
||||
.gpl_only = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
100
lib/iov_iter.c
100
lib/iov_iter.c
@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
|
||||
buf = iov->iov_base + skip;
|
||||
copy = min(bytes, iov->iov_len - skip);
|
||||
|
||||
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
|
||||
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
|
||||
kaddr = kmap_atomic(page);
|
||||
from = kaddr + offset;
|
||||
|
||||
@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
|
||||
buf = iov->iov_base + skip;
|
||||
copy = min(bytes, iov->iov_len - skip);
|
||||
|
||||
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
|
||||
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
|
||||
kaddr = kmap_atomic(page);
|
||||
to = kaddr + offset;
|
||||
|
||||
@@ -431,35 +431,81 @@ out:
|
||||
}
|
||||
|
||||
/*
|
||||
* Fault in one or more iovecs of the given iov_iter, to a maximum length of
|
||||
* bytes. For each iovec, fault in each page that constitutes the iovec.
|
||||
* fault_in_iov_iter_readable - fault in iov iterator for reading
|
||||
* @i: iterator
|
||||
* @size: maximum length
|
||||
*
|
||||
* Return 0 on success, or non-zero if the memory could not be accessed (i.e.
|
||||
* because it is an invalid address).
|
||||
* Fault in one or more iovecs of the given iov_iter, to a maximum length of
|
||||
* @size. For each iovec, fault in each page that constitutes the iovec.
|
||||
*
|
||||
* Returns the number of bytes not faulted in (like copy_to_user() and
|
||||
* copy_from_user()).
|
||||
*
|
||||
* Always returns 0 for non-userspace iterators.
|
||||
*/
|
||||
int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
|
||||
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
|
||||
{
|
||||
if (iter_is_iovec(i)) {
|
||||
size_t count = min(size, iov_iter_count(i));
|
||||
const struct iovec *p;
|
||||
size_t skip;
|
||||
|
||||
if (bytes > i->count)
|
||||
bytes = i->count;
|
||||
for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
|
||||
size_t len = min(bytes, p->iov_len - skip);
|
||||
int err;
|
||||
size -= count;
|
||||
for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
|
||||
size_t len = min(count, p->iov_len - skip);
|
||||
size_t ret;
|
||||
|
||||
if (unlikely(!len))
|
||||
continue;
|
||||
err = fault_in_pages_readable(p->iov_base + skip, len);
|
||||
if (unlikely(err))
|
||||
return err;
|
||||
bytes -= len;
|
||||
ret = fault_in_readable(p->iov_base + skip, len);
|
||||
count -= len - ret;
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
return count + size;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(iov_iter_fault_in_readable);
|
||||
EXPORT_SYMBOL(fault_in_iov_iter_readable);
|
||||
|
||||
/*
|
||||
* fault_in_iov_iter_writeable - fault in iov iterator for writing
|
||||
* @i: iterator
|
||||
* @size: maximum length
|
||||
*
|
||||
* Faults in the iterator using get_user_pages(), i.e., without triggering
|
||||
* hardware page faults. This is primarily useful when we already know that
|
||||
* some or all of the pages in @i aren't in memory.
|
||||
*
|
||||
* Returns the number of bytes not faulted in, like copy_to_user() and
|
||||
* copy_from_user().
|
||||
*
|
||||
* Always returns 0 for non-user-space iterators.
|
||||
*/
|
||||
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
|
||||
{
|
||||
if (iter_is_iovec(i)) {
|
||||
size_t count = min(size, iov_iter_count(i));
|
||||
const struct iovec *p;
|
||||
size_t skip;
|
||||
|
||||
size -= count;
|
||||
for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
|
||||
size_t len = min(count, p->iov_len - skip);
|
||||
size_t ret;
|
||||
|
||||
if (unlikely(!len))
|
||||
continue;
|
||||
ret = fault_in_safe_writeable(p->iov_base + skip, len);
|
||||
count -= len - ret;
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
return count + size;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(fault_in_iov_iter_writeable);
|
||||
|
||||
void iov_iter_init(struct iov_iter *i, unsigned int direction,
|
||||
const struct iovec *iov, unsigned long nr_segs,
|
||||
@@ -468,6 +514,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
|
||||
WARN_ON(direction & ~(READ | WRITE));
|
||||
*i = (struct iov_iter) {
|
||||
.iter_type = ITER_IOVEC,
|
||||
.nofault = false,
|
||||
.data_source = direction,
|
||||
.iov = iov,
|
||||
.nr_segs = nr_segs,
|
||||
@@ -1483,13 +1530,17 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
|
||||
return 0;
|
||||
|
||||
if (likely(iter_is_iovec(i))) {
|
||||
unsigned int gup_flags = 0;
|
||||
unsigned long addr;
|
||||
|
||||
if (iov_iter_rw(i) != WRITE)
|
||||
gup_flags |= FOLL_WRITE;
|
||||
if (i->nofault)
|
||||
gup_flags |= FOLL_NOFAULT;
|
||||
|
||||
addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
|
||||
n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
res = get_user_pages_fast(addr, n,
|
||||
iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
|
||||
pages);
|
||||
res = get_user_pages_fast(addr, n, gup_flags, pages);
|
||||
if (unlikely(res <= 0))
|
||||
return res;
|
||||
return (res == n ? len : res * PAGE_SIZE) - *start;
|
||||
@@ -1605,15 +1656,20 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
|
||||
return 0;
|
||||
|
||||
if (likely(iter_is_iovec(i))) {
|
||||
unsigned int gup_flags = 0;
|
||||
unsigned long addr;
|
||||
|
||||
if (iov_iter_rw(i) != WRITE)
|
||||
gup_flags |= FOLL_WRITE;
|
||||
if (i->nofault)
|
||||
gup_flags |= FOLL_NOFAULT;
|
||||
|
||||
addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
|
||||
n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
p = get_pages_array(n);
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
res = get_user_pages_fast(addr, n,
|
||||
iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
|
||||
res = get_user_pages_fast(addr, n, gup_flags, p);
|
||||
if (unlikely(res <= 0)) {
|
||||
kvfree(p);
|
||||
*pages = NULL;
|
||||
|
||||
@@ -90,7 +90,7 @@
|
||||
* ->lock_page (filemap_fault, access_process_vm)
|
||||
*
|
||||
* ->i_rwsem (generic_perform_write)
|
||||
* ->mmap_lock (fault_in_pages_readable->do_page_fault)
|
||||
* ->mmap_lock (fault_in_readable->do_page_fault)
|
||||
*
|
||||
* bdi->wb.list_lock
|
||||
* sb_lock (fs/fs-writeback.c)
|
||||
@@ -3810,7 +3810,7 @@ again:
|
||||
* same page as we're writing to, without it being marked
|
||||
* up-to-date.
|
||||
*/
|
||||
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
|
||||
if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
|
||||
status = -EFAULT;
|
||||
break;
|
||||
}
|
||||
|
||||
120
mm/gup.c
120
mm/gup.c
@@ -943,6 +943,8 @@ static int faultin_page(struct vm_area_struct *vma,
|
||||
/* mlock all present pages, but do not fault in new pages */
|
||||
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
|
||||
return -ENOENT;
|
||||
if (*flags & FOLL_NOFAULT)
|
||||
return -EFAULT;
|
||||
if (*flags & FOLL_WRITE)
|
||||
fault_flags |= FAULT_FLAG_WRITE;
|
||||
if (*flags & FOLL_REMOTE)
|
||||
@@ -1681,6 +1683,122 @@ finish_or_fault:
|
||||
}
|
||||
#endif /* !CONFIG_MMU */
|
||||
|
||||
/**
|
||||
* fault_in_writeable - fault in userspace address range for writing
|
||||
* @uaddr: start of address range
|
||||
* @size: size of address range
|
||||
*
|
||||
* Returns the number of bytes not faulted in (like copy_to_user() and
|
||||
* copy_from_user()).
|
||||
*/
|
||||
size_t fault_in_writeable(char __user *uaddr, size_t size)
|
||||
{
|
||||
char __user *start = uaddr, *end;
|
||||
|
||||
if (unlikely(size == 0))
|
||||
return 0;
|
||||
if (!PAGE_ALIGNED(uaddr)) {
|
||||
if (unlikely(__put_user(0, uaddr) != 0))
|
||||
return size;
|
||||
uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
|
||||
}
|
||||
end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
|
||||
if (unlikely(end < start))
|
||||
end = NULL;
|
||||
while (uaddr != end) {
|
||||
if (unlikely(__put_user(0, uaddr) != 0))
|
||||
goto out;
|
||||
uaddr += PAGE_SIZE;
|
||||
}
|
||||
|
||||
out:
|
||||
if (size > uaddr - start)
|
||||
return size - (uaddr - start);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(fault_in_writeable);
|
||||
|
||||
/*
|
||||
* fault_in_safe_writeable - fault in an address range for writing
|
||||
* @uaddr: start of address range
|
||||
* @size: length of address range
|
||||
*
|
||||
* Faults in an address range for writing. This is primarily useful when we
|
||||
* already know that some or all of the pages in the address range aren't in
|
||||
* memory.
|
||||
*
|
||||
* Unlike fault_in_writeable(), this function is non-destructive.
|
||||
*
|
||||
* Note that we don't pin or otherwise hold the pages referenced that we fault
|
||||
* in. There's no guarantee that they'll stay in memory for any duration of
|
||||
* time.
|
||||
*
|
||||
* Returns the number of bytes not faulted in, like copy_to_user() and
|
||||
* copy_from_user().
|
||||
*/
|
||||
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
|
||||
{
|
||||
unsigned long start = (unsigned long)uaddr, end;
|
||||
struct mm_struct *mm = current->mm;
|
||||
bool unlocked = false;
|
||||
|
||||
if (unlikely(size == 0))
|
||||
return 0;
|
||||
end = PAGE_ALIGN(start + size);
|
||||
if (end < start)
|
||||
end = 0;
|
||||
|
||||
mmap_read_lock(mm);
|
||||
do {
|
||||
if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
|
||||
break;
|
||||
start = (start + PAGE_SIZE) & PAGE_MASK;
|
||||
} while (start != end);
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
if (size > (unsigned long)uaddr - start)
|
||||
return size - ((unsigned long)uaddr - start);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(fault_in_safe_writeable);
|
||||
|
||||
/**
|
||||
* fault_in_readable - fault in userspace address range for reading
|
||||
* @uaddr: start of user address range
|
||||
* @size: size of user address range
|
||||
*
|
||||
* Returns the number of bytes not faulted in (like copy_to_user() and
|
||||
* copy_from_user()).
|
||||
*/
|
||||
size_t fault_in_readable(const char __user *uaddr, size_t size)
|
||||
{
|
||||
const char __user *start = uaddr, *end;
|
||||
volatile char c;
|
||||
|
||||
if (unlikely(size == 0))
|
||||
return 0;
|
||||
if (!PAGE_ALIGNED(uaddr)) {
|
||||
if (unlikely(__get_user(c, uaddr) != 0))
|
||||
return size;
|
||||
uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
|
||||
}
|
||||
end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
|
||||
if (unlikely(end < start))
|
||||
end = NULL;
|
||||
while (uaddr != end) {
|
||||
if (unlikely(__get_user(c, uaddr) != 0))
|
||||
goto out;
|
||||
uaddr += PAGE_SIZE;
|
||||
}
|
||||
|
||||
out:
|
||||
(void)c;
|
||||
if (size > uaddr - start)
|
||||
return size - (uaddr - start);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(fault_in_readable);
|
||||
|
||||
/**
|
||||
* get_dump_page() - pin user page in memory while writing it to core dump
|
||||
* @addr: user address
|
||||
@@ -2733,7 +2851,7 @@ static int internal_get_user_pages_fast(unsigned long start,
|
||||
|
||||
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
|
||||
FOLL_FORCE | FOLL_PIN | FOLL_GET |
|
||||
FOLL_FAST_ONLY)))
|
||||
FOLL_FAST_ONLY | FOLL_NOFAULT)))
|
||||
return -EINVAL;
|
||||
|
||||
if (gup_flags & FOLL_PIN)
|
||||
|
||||
@@ -528,6 +528,8 @@ static bool __init kfence_init_pool(void)
|
||||
* enters __slab_free() slow-path.
|
||||
*/
|
||||
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
|
||||
struct page *page = &pages[i];
|
||||
|
||||
if (!i || (i % 2))
|
||||
continue;
|
||||
|
||||
@@ -535,7 +537,11 @@ static bool __init kfence_init_pool(void)
|
||||
if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
|
||||
goto err;
|
||||
|
||||
__SetPageSlab(&pages[i]);
|
||||
__SetPageSlab(page);
|
||||
#ifdef CONFIG_MEMCG
|
||||
page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
|
||||
MEMCG_DATA_OBJCGS;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -911,6 +917,9 @@ void __kfence_free(void *addr)
|
||||
{
|
||||
struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
KFENCE_WARN_ON(meta->objcg);
|
||||
#endif
|
||||
/*
|
||||
* If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
|
||||
* the object, as the object page may be recycled for other-typed
|
||||
|
||||
@@ -89,6 +89,9 @@ struct kfence_metadata {
|
||||
struct kfence_track free_track;
|
||||
/* For updating alloc_covered on frees. */
|
||||
u32 alloc_stack_hash;
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct obj_cgroup *objcg;
|
||||
#endif
|
||||
};
|
||||
|
||||
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
|
||||
|
||||
@@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
|
||||
{ offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
|
||||
PTR_TO_BTF_ID_OR_NULL },
|
||||
{ offsetof(struct bpf_iter__bpf_sk_storage_map, value),
|
||||
PTR_TO_RDWR_BUF_OR_NULL },
|
||||
PTR_TO_BUF | PTR_MAYBE_NULL },
|
||||
},
|
||||
.seq_info = &iter_seq_info,
|
||||
};
|
||||
|
||||
@@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_PTR_TO_MEM,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
};
|
||||
@@ -2018,9 +2018,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
|
||||
.gpl_only = false,
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg3_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
};
|
||||
@@ -2541,7 +2541,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_ANYTHING,
|
||||
.arg2_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
@@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = {
|
||||
.arg1_btf_id = &bpf_skb_output_btf_ids[0],
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
@@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = {
|
||||
.arg1_btf_id = &bpf_xdp_output_btf_ids[0],
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@@ -5072,7 +5072,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = {
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -5106,7 +5106,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -5140,7 +5140,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -5315,7 +5315,7 @@ static const struct bpf_func_proto bpf_bind_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -5903,7 +5903,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_PTR_TO_MEM,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE
|
||||
};
|
||||
|
||||
@@ -5913,7 +5913,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_PTR_TO_MEM,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE
|
||||
};
|
||||
|
||||
@@ -5956,7 +5956,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_PTR_TO_MEM,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE
|
||||
};
|
||||
|
||||
@@ -6044,7 +6044,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_PTR_TO_MEM,
|
||||
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg4_type = ARG_CONST_SIZE
|
||||
};
|
||||
|
||||
@@ -6269,7 +6269,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6288,7 +6288,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6307,7 +6307,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6344,7 +6344,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6367,7 +6367,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6390,7 +6390,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6409,7 +6409,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6428,7 +6428,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6447,7 +6447,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
@@ -6769,9 +6769,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -6838,9 +6838,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
|
||||
.pkt_access = true,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@@ -7069,7 +7069,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
@@ -1575,7 +1575,7 @@ static struct bpf_iter_reg sock_map_iter_reg = {
|
||||
.ctx_arg_info_size = 2,
|
||||
.ctx_arg_info = {
|
||||
{ offsetof(struct bpf_iter__sockmap, key),
|
||||
PTR_TO_RDONLY_BUF_OR_NULL },
|
||||
PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
|
||||
{ offsetof(struct bpf_iter__sockmap, sk),
|
||||
PTR_TO_BTF_ID_OR_NULL },
|
||||
},
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "test_ksyms_btf.skel.h"
|
||||
#include "test_ksyms_btf_null_check.skel.h"
|
||||
#include "test_ksyms_weak.skel.h"
|
||||
#include "test_ksyms_btf_write_check.skel.h"
|
||||
|
||||
static int duration;
|
||||
|
||||
@@ -109,6 +110,16 @@ cleanup:
|
||||
test_ksyms_weak__destroy(skel);
|
||||
}
|
||||
|
||||
/*
 * Subtest: try to open and load the test_ksyms_btf_write_check skeleton and
 * assert that the attempt yields an error, then release whatever was
 * returned.
 */
static void test_write_check(void)
{
	struct test_ksyms_btf_write_check *skel =
		test_ksyms_btf_write_check__open_and_load();

	ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n");

	test_ksyms_btf_write_check__destroy(skel);
}
|
||||
|
||||
void test_ksyms_btf(void)
|
||||
{
|
||||
int percpu_datasec;
|
||||
@@ -136,4 +147,7 @@ void test_ksyms_btf(void)
|
||||
|
||||
if (test__start_subtest("weak_ksyms"))
|
||||
test_weak_syms();
|
||||
|
||||
if (test__start_subtest("write_check"))
|
||||
test_write_check();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2021 Google */
|
||||
|
||||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
extern const int bpf_prog_active __ksym; /* int type global var. */
|
||||
|
||||
/*
 * Looks up this CPU's instance of the bpf_prog_active ksym and stores to it.
 * The store is the point of the program: it is expected to make the program
 * fail verification.
 */
SEC("raw_tp/sys_enter")
int handler(const void *ctx)
{
	__u32 cpu = bpf_get_smp_processor_id();
	int *active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);

	if (!active)
		return 0;

	/* Kernel memory obtained from bpf_{per,this}_cpu_ptr
	 * is read-only, should _not_ pass verification.
	 */
	/* WRITE_ONCE */
	*(volatile int *)active = -1;

	return 0;
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
@@ -107,6 +107,25 @@
|
||||
.result = REJECT,
|
||||
.errstr = "R0 min value is outside of the allowed memory range",
|
||||
},
|
||||
{
|
||||
"calls: trigger reg2btf_ids[reg->type] for reg->type > __BPF_REG_TYPE_MAX",
|
||||
.insns = {
|
||||
BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
|
||||
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
|
||||
BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
|
||||
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
|
||||
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
|
||||
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
|
||||
BPF_EXIT_INSN(),
|
||||
},
|
||||
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
|
||||
.result = REJECT,
|
||||
.errstr = "arg#0 pointer type STRUCT prog_test_ref_kfunc must point",
|
||||
.fixup_kfunc_btf_id = {
|
||||
{ "bpf_kfunc_call_test_acquire", 3 },
|
||||
{ "bpf_kfunc_call_test_release", 5 },
|
||||
},
|
||||
},
|
||||
{
|
||||
"calls: overlapping caller/callee",
|
||||
.insns = {
|
||||
|
||||
Reference in New Issue
Block a user