mirror of
https://github.com/hardkernel/linux.git
synced 2026-06-05 02:21:52 +09:00
Changes in 6.1.141 gpio: pca953x: Add missing header(s) gpio: pca953x: Split pca953x_restore_context() and pca953x_save_context() gpio: pca953x: Simplify code with cleanup helpers gpio: pca953x: fix IRQ storm on system wake up phy: renesas: rcar-gen3-usb2: Add support to initialize the bus phy: renesas: rcar-gen3-usb2: Move IRQ request in probe phy: renesas: rcar-gen3-usb2: Lock around hardware registers and driver data phy: renesas: rcar-gen3-usb2: Assert PLL reset on PHY power off scsi: target: iscsi: Fix timeout on deleted connection virtio_ring: Fix data race by tagging event_triggered as racy for KCSAN dma-mapping: avoid potential unused data compilation warning cgroup: Fix compilation issue due to cgroup_mutex not being exported scsi: mpi3mr: Add level check to control event logging net: enetc: refactor bulk flipping of RX buffers to separate function drm/amdgpu: Allow P2P access through XGMI selftests/bpf: Mitigate sockmap_ktls disconnect_after_delete failure bpf: fix possible endless loop in BPF map iteration samples/bpf: Fix compilation failure for samples/bpf on LoongArch Fedora kconfig: merge_config: use an empty file as initfile s390/vfio-ap: Fix no AP queue sharing allowed message written to kernel log cifs: Add fallback for SMB2 CREATE without FILE_READ_ATTRIBUTES cifs: Fix querying and creating MF symlinks over SMB1 cifs: Fix negotiate retry functionality fuse: Return EPERM rather than ENOSYS from link() NFSv4: Check for delegation validity in nfs_start_delegation_return_locked() NFS: Don't allow waiting for exiting tasks SUNRPC: Don't allow waiting for exiting tasks arm64: Add support for HIP09 Spectre-BHB mitigation tracing: Mark binary printing functions with __printf() attribute mailbox: use error ret code of of_parse_phandle_with_args() fbdev: fsl-diu-fb: add missing device_remove_file() fbcon: Use correct erase colour for clearing in fbcon fbdev: core: tileblit: Implement missing margin clearing for tileblit cifs: Fix establishing NetBIOS 
session for SMB2+ connection NFSv4: Treat ENETUNREACH errors as fatal for state recovery SUNRPC: rpc_clnt_set_transport() must not change the autobind setting SUNRPC: rpcbind should never reset the port to the value '0' thermal/drivers/qoriq: Power down TMU on system suspend dql: Fix dql->limit value when reset. lockdep: Fix wait context check on softirq for PREEMPT_RT objtool: Properly disable uaccess validation PCI: dwc: ep: Ensure proper iteration over outbound map windows tools/build: Don't pass test log files to linker pNFS/flexfiles: Report ENETDOWN as a connection error PCI: vmd: Disable MSI remapping bypass under Xen libnvdimm/labels: Fix divide error in nd_label_data_init() mmc: host: Wait for Vdd to settle on card power off x86/mm: Check return value from memblock_phys_alloc_range() i2c: qup: Vote for interconnect bandwidth to DRAM i2c: pxa: fix call balance of i2c->clk handling routines btrfs: make btrfs_discard_workfn() block_group ref explicit btrfs: avoid linker error in btrfs_find_create_tree_block() btrfs: run btrfs_error_commit_super() early btrfs: fix non-empty delayed iputs list on unmount due to async workers btrfs: get zone unusable bytes while holding lock at btrfs_reclaim_bgs_work() btrfs: send: return -ENAMETOOLONG when attempting a path that is too long drm/amd/display: Guard against setting dispclk low for dcn31x i3c: master: svc: Fix missing STOP for master request dlm: make tcp still work in multi-link env um: Store full CSGSFS and SS register from mcontext um: Update min_low_pfn to match changes in uml_reserved ext4: reorder capability check last scsi: st: Tighten the page format heuristics with MODE SELECT scsi: st: ERASE does not change tape location vfio/pci: Handle INTx IRQ_NOTCONNECTED bpf: Return prog btf_id without capable check tcp: reorganize tcp_in_ack_event() and tcp_count_delivered() rtc: rv3032: fix EERD location thunderbolt: Do not add non-active NVM if NVM upgrade is disabled for retimer ASoC: mediatek: mt6359: Add stub 
for mt6359_accdet_enable_jack_detect kbuild: fix argument parsing in scripts/config crypto: octeontx2 - suppress auth failure screaming due to negative tests dm: restrict dm device size to 2^63-512 bytes net/smc: use the correct ndev to find pnetid by pnetid table xen: Add support for XenServer 6.1 platform device pinctrl-tegra: Restore SFSEL bit when freeing pins ASoC: sun4i-codec: support hp-det-gpios property ext4: reject the 'data_err=abort' option in nojournal mode RDMA/uverbs: Propagate errors from rdma_lookup_get_uobject() posix-timers: Add cond_resched() to posix_timer_add() search loop timer_list: Don't use %pK through printk() netfilter: conntrack: Bound nf_conntrack sysctl writes arm64/mm: Check PUD_TYPE_TABLE in pud_bad() mmc: dw_mmc: add exynos7870 DW MMC support mmc: sdhci: Disable SD card clock before changing parameters hwmon: (dell-smm) Increment the number of fans ipv6: save dontfrag in cork drm/amd/display: calculate the remain segments for all pipes gfs2: Check for empty queue in run_queue auxdisplay: charlcd: Partially revert "Move hwidth and bwidth to struct hd44780_common" ASoC: qcom: sm8250: explicitly set format in sm8250_be_hw_params_fixup() iommu/amd/pgtbl_v2: Improve error handling cpufreq: tegra186: Share policy per cluster crypto: lzo - Fix compression buffer overrun arm64: tegra: p2597: Fix gpio for vdd-1v8-dis regulator powerpc/prom_init: Fixup missing #size-cells on PowerBook6,7 ALSA: seq: Improve data consistency at polling tcp: bring back NUMA dispersion in inet_ehash_locks_alloc() rtc: ds1307: stop disabling alarms on probe ieee802154: ca8210: Use proper setters and getters for bitwise types ARM: tegra: Switch DSI-B clock parent to PLLD on Tegra114 media: c8sectpfe: Call of_node_put(i2c_bus) only once in c8sectpfe_probe() dm cache: prevent BUG_ON by blocking retries on failed device resumes orangefs: Do not truncate file size net: phylink: use pl->link_interface in phylink_expects_phy() remoteproc: qcom_wcnss: Handle platforms 
with only single power domain drm/amdgpu: Do not program AGP BAR regs under SRIOV in gfxhub_v1_0.c media: cx231xx: set device_caps for 417 pinctrl: bcm281xx: Use "unsigned int" instead of bare "unsigned" net: ethernet: ti: cpsw_new: populate netdev of_node net: pktgen: fix mpls maximum labels list parsing perf/hw_breakpoint: Return EOPNOTSUPP for unsupported breakpoint type ALSA: hda/realtek: Enable PC beep passthrough for HP EliteBook 855 G7 ipv4: fib: Move fib_valid_key_len() to rtm_to_fib_config(). drm/rockchip: vop2: Add uv swap for cluster window media: uvcvideo: Add sanity check to uvc_ioctl_xu_ctrl_map clk: imx8mp: inform CCF of maximum frequency of clocks x86/bugs: Make spectre user default depend on MITIGATION_SPECTRE_V2 hwmon: (gpio-fan) Add missing mutex locks ARM: at91: pm: fix at91_suspend_finish for ZQ calibration drm/mediatek: mtk_dpi: Add checks for reg_h_fre_con existence fpga: altera-cvp: Increase credit timeout soc: apple: rtkit: Use high prio work queue soc: apple: rtkit: Implement OSLog buffers properly PCI: brcmstb: Expand inbound window size up to 64GB PCI: brcmstb: Add a softdep to MIP MSI-X driver firmware: arm_ffa: Set dma_mask for ffa devices net/mlx5: Avoid report two health errors on same syndrome selftests/net: have `gro.sh -t` return a correct exit code drm/amdkfd: KFD release_work possible circular locking leds: pwm-multicolor: Add check for fwnode_property_read_u32 net: ethernet: mtk_ppe_offload: Allow QinQ, double ETH_P_8021Q only net: xgene-v2: remove incorrect ACPI_PTR annotation bonding: report duplicate MAC address in all situations soc: ti: k3-socinfo: Do not use syscon helper to build regmap x86/build: Fix broken copy command in genimage.sh when making isoimage drm/amd/display: handle max_downscale_src_width fail check x86/nmi: Add an emergency handler in nmi_desc & use it in nmi_shootdown_cpus() cpuidle: menu: Avoid discarding useful information media: adv7180: Disable test-pattern control on adv7180 libbpf: Fix out-of-bound 
read dm: fix unconditional IO throttle caused by REQ_PREFLUSH x86/kaslr: Reduce KASLR entropy on most x86 systems MIPS: Use arch specific syscall name match function genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie MIPS: pm-cps: Use per-CPU variables as per-CPU, not per-core clocksource: mips-gic-timer: Enable counter when CPUs start scsi: mpt3sas: Send a diag reset if target reset fails wifi: rtw88: Fix rtw_init_vht_cap() for RTL8814AU wifi: rtw88: Fix rtw_init_ht_cap() for RTL8814AU wifi: rtw88: Fix rtw_desc_to_mcsrate() to handle MCS16-31 wifi: rtw89: fw: propagate error code from rtw89_h2c_tx() net: pktgen: fix access outside of user given buffer in pktgen_thread_write() EDAC/ie31200: work around false positive build warning i3c: master: svc: Flush FIFO before sending Dynamic Address Assignment(DAA) serial: mctrl_gpio: split disable_ms into sync and no_sync APIs RDMA/core: Fix best page size finding when it can cross SG entries pmdomain: imx: gpcv2: use proper helper for property detection can: c_can: Use of_property_present() to test existence of DT property eth: mlx4: don't try to complete XDP frames in netpoll PCI: Fix old_size lower bound in calculate_iosize() too ACPI: HED: Always initialize before evged vxlan: Join / leave MC group after remote changes media: test-drivers: vivid: don't call schedule in loop net/mlx5: Modify LSB bitmask in temperature event to include only the first bit net/mlx5: Apply rate-limiting to high temperature warning ASoC: ops: Enforce platform maximum on initial value ASoC: tas2764: Add reg defaults for TAS2764_INT_CLK_CFG ASoC: tas2764: Mark SW_RESET as volatile ASoC: tas2764: Power up/down amp on mute ops ASoC: soc-dai: check return value at snd_soc_dai_set_tdm_slot() pinctrl: devicetree: do not goto err when probing hogs in pinctrl_dt_to_map smack: recognize ipv4 CIPSO w/o categories kunit: tool: Use qboot on QEMU x86_64 net/mlx4_core: Avoid impossible mlx4_db_alloc() order value clk: qcom: 
clk-alpha-pll: Do not use random stack value for recalc rate serial: sh-sci: Update the suspend/resume support phy: core: don't require set_mode() callback for phy_get_mode() to work drm/amdgpu: reset psp->cmd to NULL after releasing the buffer drm/amd/display: Initial psr_version with correct setting drm/amdgpu: enlarge the VBIOS binary size limit drm/amd/display/dm: drop hw_support check in amdgpu_dm_i2c_xfer() net/mlx5: Extend Ethtool loopback selftest to support non-linear SKB net/mlx5e: set the tx_queue_len for pfifo_fast net/mlx5e: reduce rep rxq depth to 256 for ECPF wifi: mac80211: don't unconditionally call drv_mgd_complete_tx() wifi: mac80211: remove misplaced drv_mgd_complete_tx() call arch/powerpc/perf: Check the instruction type before creating sample with perf_mem_data_src ip: fib_rules: Fetch net from fib_rule in fib[46]_rule_configure(). r8152: add vendor/device ID pair for Dell Alienware AW1022z wifi: rtw88: Fix download_firmware_validate() for RTL8814AU clk: qcom: camcc-sm8250: Use clk_rcg2_shared_ops for some RCGs hwmon: (xgene-hwmon) use appropriate type for the latency value media: qcom: camss: csid: Only add TPG v4l2 ctrl if TPG hardware is available vxlan: Annotate FDB data races r8169: don't scan PHY addresses > 0 rcu: handle quiescent states for PREEMPT_RCU=n, PREEMPT_COUNT=y rcu: handle unstable rdp in rcu_read_unlock_strict() rcu: fix header guard for rcu_all_qs() perf: Avoid the read if the count is already updated ice: count combined queues using Rx/Tx count net/mana: fix warning in the writer of client oob scsi: lpfc: Handle duplicate D_IDs in ndlp search-by D_ID routine scsi: lpfc: Free phba irq in lpfc_sli4_enable_msi() when pci_irq_vector() fails scsi: st: Restore some drive settings after reset HID: usbkbd: Fix the bit shift number for LED_KANA ASoC: codecs: pcm3168a: Allow for 24-bit in provider mode drm/ast: Find VBIOS mode from regular display size bpftool: Fix readlink usage in get_fd_type perf/amd/ibs: Fix perf_ibs_op.cnt_mask 
for CurCnt wifi: rtl8xxxu: retry firmware download on error wifi: rtw88: Don't use static local variable in rtw8822b_set_tx_power_index_by_rate wifi: rtw89: add wiphy_lock() to work that isn't held wiphy_lock() yet spi: zynqmp-gqspi: Always acknowledge interrupts regulator: ad5398: Add device tree support wifi: ath9k: return by of_get_mac_address drm/atomic: clarify the rules around drm_atomic_state->allow_modeset drm/panel-edp: Add Starry 116KHD024006 drm: Add valid clones check ASoC: imx-card: Adjust over allocation of memory in imx_card_parse_of() pinctrl: meson: define the pull up/down resistor value as 60 kOhm ASoC: Intel: bytcr_rt5640: Add DMI quirk for Acer Aspire SW3-013 ALSA: hda/realtek: Add quirk for HP Spectre x360 15-df1xxx nvmet-tcp: don't restore null sk_state_change io_uring/fdinfo: annotate racy sq/cq head/tail reads btrfs: correct the order of prelim_ref arguments in btrfs__prelim_ref wifi: iwlwifi: add support for Killer on MTL xenbus: Allow PVH dom0 a non-local xenstore __legitimize_mnt(): check for MNT_SYNC_UMOUNT should be under mount_lock espintcp: remove encap socket caching to avoid reference leak dmaengine: idxd: add per DSA wq workqueue for processing cr faults dmaengine: idxd: add idxd_copy_cr() to copy user completion record during page fault handling dmaengine: idxd: Fix allowing write() from different address spaces remoteproc: qcom_wcnss: Fix on platforms without fallback regulators clk: sunxi-ng: d1: Add missing divider for MMC mod clocks xfrm: Sanitize marks before insert dmaengine: idxd: Fix ->poll() return value Bluetooth: L2CAP: Fix not checking l2cap_chan security level bridge: netfilter: Fix forwarding of fragmented packets ice: fix vf->num_mac count with port representors net: dwmac-sun8i: Use parsed internal PHY address instead of 1 net: lan743x: Restore SGMII CTRL register on resume io_uring: fix overflow resched cqe reordering sch_hfsc: Fix qlen accounting bug when using peek in hfsc_enqueue() octeontx2-pf: Add support for 
page pool octeontx2-pf: Add AF_XDP non-zero copy support net/tipc: fix slab-use-after-free Read in tipc_aead_encrypt_done octeontx2-af: Set LMT_ENA bit for APR table entries octeontx2-af: Fix APR entry mapping based on APR_LMT_CFG crypto: algif_hash - fix double free in hash_accept padata: do not leak refcount in reorder_work can: slcan: allow reception of short error messages can: bcm: add locking for bcm_op runtime updates can: bcm: add missing rcu read protection for procfs content ALSA: pcm: Fix race of buffer access at PCM OSS layer ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14ASP10 llc: fix data loss when reading from a socket in llc_ui_recvmsg() platform/x86: dell-wmi-sysman: Avoid buffer overflow in current_password_store() drm/edid: fixed the bug that hdr metadata was not reset smb: client: Fix use-after-free in cifs_fill_dirent smb: client: Reset all search buffer pointers when releasing buffer Revert "drm/amd: Keep display off while going into S4" memcg: always call cond_resched() after fn() mm/page_alloc.c: avoid infinite retries caused by cpuset race Revert "arm64: dts: allwinner: h6: Use RSB for AXP805 PMIC connection" ksmbd: fix stream write failure spi: spi-fsl-dspi: restrict register range for regmap access spi: spi-fsl-dspi: Halt the module after a new message transfer spi: spi-fsl-dspi: Reset SR flags before sending a new message kbuild: Disable -Wdefault-const-init-unsafe serial: sh-sci: Save and restore more registers pinctrl: tegra: Fix off by one in tegra_pinctrl_get_group() i3c: master: svc: Fix implicit fallthrough in svc_i3c_master_ibi_work() x86/mm/init: Handle the special case of device private pages in add_pages(), to not increase max_pfn and trigger dma_addressing_limited() bounce buffers bounce buffers dmaengine: idxd: Fix passing freed memory in idxd_cdev_open() octeontx2-pf: fix page_pool creation fail for rings > 32k octeontx2-pf: Fix page pool cache index corruption. 
octeontx2-pf: Fix page pool frag allocation warning hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING btrfs: check folio mapping after unlock in relocate_one_folio() af_unix: Kconfig: make CONFIG_UNIX bool af_unix: Return struct unix_sock from unix_get_socket(). af_unix: Run GC on only one CPU. af_unix: Try to run GC async. af_unix: Replace BUG_ON() with WARN_ON_ONCE(). af_unix: Remove io_uring code for GC. af_unix: Remove CONFIG_UNIX_SCM. af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd. af_unix: Link struct unix_edge when queuing skb. af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb. af_unix: Iterate all vertices by DFS. af_unix: Detect Strongly Connected Components. af_unix: Save listener for embryo socket. af_unix: Fix up unix_edge.successor for embryo socket. af_unix: Save O(n) setup of Tarjan's algo. af_unix: Skip GC if no cycle exists. af_unix: Avoid Tarjan's algorithm if unnecessary. af_unix: Assign a unique index to SCC. af_unix: Detect dead SCC. af_unix: Replace garbage collection algorithm. af_unix: Remove lock dance in unix_peek_fds(). af_unix: Try not to hold unix_gc_lock during accept(). af_unix: Don't access successor in unix_del_edges() during GC. af_unix: Add dead flag to struct scm_fp_list. 
af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS af_unix: Fix uninit-value in __unix_walk_scc() arm64: dts: qcom: sm8350: Fix typo in pil_camera_mem node net_sched: hfsc: Address reentrant enqueue adding class to eltree twice perf/arm-cmn: Fix REQ2/SNP2 mixup perf/arm-cmn: Initialise cmn->cpu earlier coredump: fix error handling for replace_fd() pid: add pidfd_prepare() fork: use pidfd_prepare() coredump: hand a pidfd to the usermode coredump helper HID: quirks: Add ADATA XPG alpha wireless mouse support nfs: don't share pNFS DS connections between net namespaces platform/x86: thinkpad_acpi: Support also NEC Lavie X1475JAS um: let 'make clean' properly clean underlying SUBARCH as well spi: spi-sun4i: fix early activation nvme-pci: add NVME_QUIRK_NO_DEEPEST_PS quirk for SOLIDIGM P44 Pro NFS: Avoid flushing data while holding directory locks in nfs_rename() platform/x86: fujitsu-laptop: Support Lifebook S2110 hotkeys platform/x86: thinkpad_acpi: Ignore battery threshold change event notification net: ethernet: ti: am65-cpsw: Lower random mac address error print to info Linux 6.1.141 Change-Id: I4b93f8e69385f2087bf71545f58ae6f5cee1c5ba Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
744 lines
18 KiB
C
744 lines
18 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Generic pidhash and scalable, time-bounded PID allocator
|
|
*
|
|
* (C) 2002-2003 Nadia Yvette Chambers, IBM
|
|
* (C) 2004 Nadia Yvette Chambers, Oracle
|
|
* (C) 2002-2004 Ingo Molnar, Red Hat
|
|
*
|
|
* pid-structures are backing objects for tasks sharing a given ID to chain
|
|
* against. There is very little to them aside from hashing them and
|
|
* parking tasks using given ID's on a list.
|
|
*
|
|
* The hash is always changed with the tasklist_lock write-acquired,
|
|
* and the hash is only accessed with the tasklist_lock at least
|
|
* read-acquired, so there's no additional SMP locking needed here.
|
|
*
|
|
* We have a list of bitmap pages, which bitmaps represent the PID space.
|
|
* Allocating and freeing PIDs is completely lockless. The worst-case
|
|
* allocation scenario when all but one out of 1 million PIDs possible are
|
|
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
|
|
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
|
|
*
|
|
* Pid namespaces:
|
|
* (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
|
|
* (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
|
|
* Many thanks to Oleg Nesterov for comments and help
|
|
*
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/export.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/init.h>
|
|
#include <linux/rculist.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/pid_namespace.h>
|
|
#include <linux/init_task.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/proc_ns.h>
|
|
#include <linux/refcount.h>
|
|
#include <linux/anon_inodes.h>
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/sched/task.h>
|
|
#include <linux/idr.h>
|
|
#include <net/sock.h>
|
|
#include <uapi/linux/pidfd.h>
|
|
|
|
struct pid init_struct_pid = {
|
|
.count = REFCOUNT_INIT(1),
|
|
.tasks = {
|
|
{ .first = NULL },
|
|
{ .first = NULL },
|
|
{ .first = NULL },
|
|
},
|
|
.level = 0,
|
|
.numbers = { {
|
|
.nr = 0,
|
|
.ns = &init_pid_ns,
|
|
}, }
|
|
};
|
|
|
|
int pid_max = PID_MAX_DEFAULT;
|
|
|
|
#define RESERVED_PIDS 300
|
|
|
|
int pid_max_min = RESERVED_PIDS + 1;
|
|
int pid_max_max = PID_MAX_LIMIT;
|
|
|
|
/*
|
|
* PID-map pages start out as NULL, they get allocated upon
|
|
* first use and are never deallocated. This way a low pid_max
|
|
* value does not cause lots of bitmaps to be allocated, but
|
|
* the scheme scales to up to 4 million PIDs, runtime.
|
|
*/
|
|
struct pid_namespace init_pid_ns = {
|
|
.ns.count = REFCOUNT_INIT(2),
|
|
.idr = IDR_INIT(init_pid_ns.idr),
|
|
.pid_allocated = PIDNS_ADDING,
|
|
.level = 0,
|
|
.child_reaper = &init_task,
|
|
.user_ns = &init_user_ns,
|
|
.ns.inum = PROC_PID_INIT_INO,
|
|
#ifdef CONFIG_PID_NS
|
|
.ns.ops = &pidns_operations,
|
|
#endif
|
|
};
|
|
EXPORT_SYMBOL_GPL(init_pid_ns);
|
|
|
|
/*
|
|
* Note: disable interrupts while the pidmap_lock is held as an
|
|
* interrupt might come in and do read_lock(&tasklist_lock).
|
|
*
|
|
* If we don't disable interrupts there is a nasty deadlock between
|
|
* detach_pid()->free_pid() and another cpu that does
|
|
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
|
|
* read_lock(&tasklist_lock);
|
|
*
|
|
* After we clean up the tasklist_lock and know there are no
|
|
* irq handlers that take it we can leave the interrupts enabled.
|
|
* For now it is easier to be safe than to prove it can't happen.
|
|
*/
|
|
|
|
/* Taken around every IDR update and pid_allocated change in this file. */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
|
|
|
|
void put_pid(struct pid *pid)
|
|
{
|
|
struct pid_namespace *ns;
|
|
|
|
if (!pid)
|
|
return;
|
|
|
|
ns = pid->numbers[pid->level].ns;
|
|
if (refcount_dec_and_test(&pid->count)) {
|
|
kmem_cache_free(ns->pid_cachep, pid);
|
|
put_pid_ns(ns);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(put_pid);
|
|
|
|
static void delayed_put_pid(struct rcu_head *rhp)
|
|
{
|
|
struct pid *pid = container_of(rhp, struct pid, rcu);
|
|
put_pid(pid);
|
|
}
|
|
|
|
void free_pid(struct pid *pid)
|
|
{
|
|
/* We can be called with write_lock_irq(&tasklist_lock) held */
|
|
int i;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&pidmap_lock, flags);
|
|
for (i = 0; i <= pid->level; i++) {
|
|
struct upid *upid = pid->numbers + i;
|
|
struct pid_namespace *ns = upid->ns;
|
|
switch (--ns->pid_allocated) {
|
|
case 2:
|
|
case 1:
|
|
/* When all that is left in the pid namespace
|
|
* is the reaper wake up the reaper. The reaper
|
|
* may be sleeping in zap_pid_ns_processes().
|
|
*/
|
|
wake_up_process(ns->child_reaper);
|
|
break;
|
|
case PIDNS_ADDING:
|
|
/* Handle a fork failure of the first process */
|
|
WARN_ON(ns->child_reaper);
|
|
ns->pid_allocated = 0;
|
|
break;
|
|
}
|
|
|
|
idr_remove(&ns->idr, upid->nr);
|
|
}
|
|
spin_unlock_irqrestore(&pidmap_lock, flags);
|
|
|
|
call_rcu(&pid->rcu, delayed_put_pid);
|
|
}
|
|
|
|
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
|
size_t set_tid_size)
|
|
{
|
|
struct pid *pid;
|
|
enum pid_type type;
|
|
int i, nr;
|
|
struct pid_namespace *tmp;
|
|
struct upid *upid;
|
|
int retval = -ENOMEM;
|
|
|
|
/*
|
|
* set_tid_size contains the size of the set_tid array. Starting at
|
|
* the most nested currently active PID namespace it tells alloc_pid()
|
|
* which PID to set for a process in that most nested PID namespace
|
|
* up to set_tid_size PID namespaces. It does not have to set the PID
|
|
* for a process in all nested PID namespaces but set_tid_size must
|
|
* never be greater than the current ns->level + 1.
|
|
*/
|
|
if (set_tid_size > ns->level + 1)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
|
|
if (!pid)
|
|
return ERR_PTR(retval);
|
|
|
|
tmp = ns;
|
|
pid->level = ns->level;
|
|
|
|
for (i = ns->level; i >= 0; i--) {
|
|
int tid = 0;
|
|
|
|
if (set_tid_size) {
|
|
tid = set_tid[ns->level - i];
|
|
|
|
retval = -EINVAL;
|
|
if (tid < 1 || tid >= pid_max)
|
|
goto out_free;
|
|
/*
|
|
* Also fail if a PID != 1 is requested and
|
|
* no PID 1 exists.
|
|
*/
|
|
if (tid != 1 && !tmp->child_reaper)
|
|
goto out_free;
|
|
retval = -EPERM;
|
|
if (!checkpoint_restore_ns_capable(tmp->user_ns))
|
|
goto out_free;
|
|
set_tid_size--;
|
|
}
|
|
|
|
idr_preload(GFP_KERNEL);
|
|
spin_lock_irq(&pidmap_lock);
|
|
|
|
if (tid) {
|
|
nr = idr_alloc(&tmp->idr, NULL, tid,
|
|
tid + 1, GFP_ATOMIC);
|
|
/*
|
|
* If ENOSPC is returned it means that the PID is
|
|
* alreay in use. Return EEXIST in that case.
|
|
*/
|
|
if (nr == -ENOSPC)
|
|
nr = -EEXIST;
|
|
} else {
|
|
int pid_min = 1;
|
|
/*
|
|
* init really needs pid 1, but after reaching the
|
|
* maximum wrap back to RESERVED_PIDS
|
|
*/
|
|
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
|
|
pid_min = RESERVED_PIDS;
|
|
|
|
/*
|
|
* Store a null pointer so find_pid_ns does not find
|
|
* a partially initialized PID (see below).
|
|
*/
|
|
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
|
|
pid_max, GFP_ATOMIC);
|
|
}
|
|
spin_unlock_irq(&pidmap_lock);
|
|
idr_preload_end();
|
|
|
|
if (nr < 0) {
|
|
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
|
|
goto out_free;
|
|
}
|
|
|
|
pid->numbers[i].nr = nr;
|
|
pid->numbers[i].ns = tmp;
|
|
tmp = tmp->parent;
|
|
}
|
|
|
|
/*
|
|
* ENOMEM is not the most obvious choice especially for the case
|
|
* where the child subreaper has already exited and the pid
|
|
* namespace denies the creation of any new processes. But ENOMEM
|
|
* is what we have exposed to userspace for a long time and it is
|
|
* documented behavior for pid namespaces. So we can't easily
|
|
* change it even if there were an error code better suited.
|
|
*/
|
|
retval = -ENOMEM;
|
|
|
|
get_pid_ns(ns);
|
|
refcount_set(&pid->count, 1);
|
|
spin_lock_init(&pid->lock);
|
|
for (type = 0; type < PIDTYPE_MAX; ++type)
|
|
INIT_HLIST_HEAD(&pid->tasks[type]);
|
|
|
|
init_waitqueue_head(&pid->wait_pidfd);
|
|
INIT_HLIST_HEAD(&pid->inodes);
|
|
|
|
upid = pid->numbers + ns->level;
|
|
spin_lock_irq(&pidmap_lock);
|
|
if (!(ns->pid_allocated & PIDNS_ADDING))
|
|
goto out_unlock;
|
|
for ( ; upid >= pid->numbers; --upid) {
|
|
/* Make the PID visible to find_pid_ns. */
|
|
idr_replace(&upid->ns->idr, pid, upid->nr);
|
|
upid->ns->pid_allocated++;
|
|
}
|
|
spin_unlock_irq(&pidmap_lock);
|
|
|
|
return pid;
|
|
|
|
out_unlock:
|
|
spin_unlock_irq(&pidmap_lock);
|
|
put_pid_ns(ns);
|
|
|
|
out_free:
|
|
spin_lock_irq(&pidmap_lock);
|
|
while (++i <= ns->level) {
|
|
upid = pid->numbers + i;
|
|
idr_remove(&upid->ns->idr, upid->nr);
|
|
}
|
|
|
|
/* On failure to allocate the first pid, reset the state */
|
|
if (ns->pid_allocated == PIDNS_ADDING)
|
|
idr_set_cursor(&ns->idr, 0);
|
|
|
|
spin_unlock_irq(&pidmap_lock);
|
|
|
|
kmem_cache_free(ns->pid_cachep, pid);
|
|
return ERR_PTR(retval);
|
|
}
|
|
|
|
void disable_pid_allocation(struct pid_namespace *ns)
|
|
{
|
|
spin_lock_irq(&pidmap_lock);
|
|
ns->pid_allocated &= ~PIDNS_ADDING;
|
|
spin_unlock_irq(&pidmap_lock);
|
|
}
|
|
|
|
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
|
|
{
|
|
return idr_find(&ns->idr, nr);
|
|
}
|
|
EXPORT_SYMBOL_GPL(find_pid_ns);
|
|
|
|
struct pid *find_vpid(int nr)
|
|
{
|
|
return find_pid_ns(nr, task_active_pid_ns(current));
|
|
}
|
|
EXPORT_SYMBOL_GPL(find_vpid);
|
|
|
|
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
|
|
{
|
|
return (type == PIDTYPE_PID) ?
|
|
&task->thread_pid :
|
|
&task->signal->pids[type];
|
|
}
|
|
|
|
/*
|
|
* attach_pid() must be called with the tasklist_lock write-held.
|
|
*/
|
|
void attach_pid(struct task_struct *task, enum pid_type type)
|
|
{
|
|
struct pid *pid = *task_pid_ptr(task, type);
|
|
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
|
|
}
|
|
|
|
static void __change_pid(struct task_struct *task, enum pid_type type,
|
|
struct pid *new)
|
|
{
|
|
struct pid **pid_ptr = task_pid_ptr(task, type);
|
|
struct pid *pid;
|
|
int tmp;
|
|
|
|
pid = *pid_ptr;
|
|
|
|
hlist_del_rcu(&task->pid_links[type]);
|
|
*pid_ptr = new;
|
|
|
|
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
|
|
if (pid_has_task(pid, tmp))
|
|
return;
|
|
|
|
free_pid(pid);
|
|
}
|
|
|
|
void detach_pid(struct task_struct *task, enum pid_type type)
|
|
{
|
|
__change_pid(task, type, NULL);
|
|
}
|
|
|
|
void change_pid(struct task_struct *task, enum pid_type type,
|
|
struct pid *pid)
|
|
{
|
|
__change_pid(task, type, pid);
|
|
attach_pid(task, type);
|
|
}
|
|
|
|
void exchange_tids(struct task_struct *left, struct task_struct *right)
|
|
{
|
|
struct pid *pid1 = left->thread_pid;
|
|
struct pid *pid2 = right->thread_pid;
|
|
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
|
|
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
|
|
|
|
/* Swap the single entry tid lists */
|
|
hlists_swap_heads_rcu(head1, head2);
|
|
|
|
/* Swap the per task_struct pid */
|
|
rcu_assign_pointer(left->thread_pid, pid2);
|
|
rcu_assign_pointer(right->thread_pid, pid1);
|
|
|
|
/* Swap the cached value */
|
|
WRITE_ONCE(left->pid, pid_nr(pid2));
|
|
WRITE_ONCE(right->pid, pid_nr(pid1));
|
|
}
|
|
|
|
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
|
|
void transfer_pid(struct task_struct *old, struct task_struct *new,
|
|
enum pid_type type)
|
|
{
|
|
if (type == PIDTYPE_PID)
|
|
new->thread_pid = old->thread_pid;
|
|
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
|
|
}
|
|
|
|
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
|
|
{
|
|
struct task_struct *result = NULL;
|
|
if (pid) {
|
|
struct hlist_node *first;
|
|
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
|
|
lockdep_tasklist_lock_is_held());
|
|
if (first)
|
|
result = hlist_entry(first, struct task_struct, pid_links[(type)]);
|
|
}
|
|
return result;
|
|
}
|
|
EXPORT_SYMBOL(pid_task);
|
|
|
|
/*
|
|
* Must be called under rcu_read_lock().
|
|
*/
|
|
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
|
|
{
|
|
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
|
|
"find_task_by_pid_ns() needs rcu_read_lock() protection");
|
|
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
|
|
}
|
|
|
|
/* find_task_by_pid_ns() in the active PID namespace of the current task. */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
	struct pid_namespace *active = task_active_pid_ns(current);

	return find_task_by_pid_ns(vnr, active);
}
EXPORT_SYMBOL_GPL(find_task_by_vpid);
|
|
|
|
/*
 * Like find_task_by_vpid(), but takes the RCU read lock itself and returns
 * the task with a reference held (or NULL if no task was found).
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
	struct task_struct *found;

	rcu_read_lock();
	found = find_task_by_vpid(nr);
	if (found != NULL)
		get_task_struct(found);
	rcu_read_unlock();

	return found;
}
|
|
|
|
/*
 * Return @task's pid of the given @type, passed through get_pid() inside
 * an RCU read-side critical section.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
	struct pid **slot;
	struct pid *pid;

	rcu_read_lock();
	slot = task_pid_ptr(task, type);
	pid = get_pid(rcu_dereference(*slot));
	rcu_read_unlock();

	return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);
|
|
|
|
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
|
|
{
|
|
struct task_struct *result;
|
|
rcu_read_lock();
|
|
result = pid_task(pid, type);
|
|
if (result)
|
|
get_task_struct(result);
|
|
rcu_read_unlock();
|
|
return result;
|
|
}
|
|
EXPORT_SYMBOL_GPL(get_pid_task);
|
|
|
|
/*
 * Look up @nr in the current task's active PID namespace and pass the
 * result through get_pid(), all under the RCU read lock.
 */
struct pid *find_get_pid(pid_t nr)
{
	struct pid *found;

	rcu_read_lock();
	found = find_vpid(nr);
	found = get_pid(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL_GPL(find_get_pid);
|
|
|
|
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
|
|
{
|
|
struct upid *upid;
|
|
pid_t nr = 0;
|
|
|
|
if (pid && ns->level <= pid->level) {
|
|
upid = &pid->numbers[ns->level];
|
|
if (upid->ns == ns)
|
|
nr = upid->nr;
|
|
}
|
|
return nr;
|
|
}
|
|
EXPORT_SYMBOL_GPL(pid_nr_ns);
|
|
|
|
pid_t pid_vnr(struct pid *pid)
|
|
{
|
|
return pid_nr_ns(pid, task_active_pid_ns(current));
|
|
}
|
|
EXPORT_SYMBOL_GPL(pid_vnr);
|
|
|
|
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
|
|
struct pid_namespace *ns)
|
|
{
|
|
pid_t nr = 0;
|
|
|
|
rcu_read_lock();
|
|
if (!ns)
|
|
ns = task_active_pid_ns(current);
|
|
nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
|
|
rcu_read_unlock();
|
|
|
|
return nr;
|
|
}
|
|
EXPORT_SYMBOL(__task_pid_nr_ns);
|
|
|
|
/*
 * Return the pid namespace of @tsk's pid, i.e. ns_of_pid(task_pid(tsk)).
 */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);

	return ns_of_pid(pid);
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
|
|
|
|
/*
|
|
* Used by proc to find the first pid that is greater than or equal to nr.
|
|
*
|
|
* If there is a pid at nr this function is exactly the same as find_pid_ns.
|
|
*/
|
|
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
|
|
{
|
|
return idr_get_next(&ns->idr, &nr);
|
|
}
|
|
EXPORT_SYMBOL_GPL(find_ge_pid);
|
|
|
|
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
|
|
{
|
|
struct fd f;
|
|
struct pid *pid;
|
|
|
|
f = fdget(fd);
|
|
if (!f.file)
|
|
return ERR_PTR(-EBADF);
|
|
|
|
pid = pidfd_pid(f.file);
|
|
if (!IS_ERR(pid)) {
|
|
get_pid(pid);
|
|
*flags = f.file->f_flags;
|
|
}
|
|
|
|
fdput(f);
|
|
return pid;
|
|
}
|
|
|
|
/**
|
|
* pidfd_get_task() - Get the task associated with a pidfd
|
|
*
|
|
* @pidfd: pidfd for which to get the task
|
|
* @flags: flags associated with this pidfd
|
|
*
|
|
* Return the task associated with @pidfd. The function takes a reference on
|
|
* the returned task. The caller is responsible for releasing that reference.
|
|
*
|
|
* Currently, the process identified by @pidfd is always a thread-group leader.
|
|
* This restriction currently exists for all aspects of pidfds including pidfd
|
|
* creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
|
|
* (only supports thread group leaders).
|
|
*
|
|
* Return: On success, the task_struct associated with the pidfd.
|
|
* On error, a negative errno number will be returned.
|
|
*/
|
|
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
|
|
{
|
|
unsigned int f_flags;
|
|
struct pid *pid;
|
|
struct task_struct *task;
|
|
|
|
pid = pidfd_get_pid(pidfd, &f_flags);
|
|
if (IS_ERR(pid))
|
|
return ERR_CAST(pid);
|
|
|
|
task = get_pid_task(pid, PIDTYPE_TGID);
|
|
put_pid(pid);
|
|
if (!task)
|
|
return ERR_PTR(-ESRCH);
|
|
|
|
*flags = f_flags;
|
|
return task;
|
|
}
|
|
|
|
/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * Creates a new pid file descriptor with the O_CLOEXEC flag set: the
 * descriptor and file come from pidfd_prepare() and are then published
 * with fd_install().
 *
 * Note that this function can only be called after the fd table has been
 * unshared, to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
int pidfd_create(struct pid *pid, unsigned int flags)
{
	struct file *file;
	int fd;

	fd = pidfd_prepare(pid, flags, &file);
	if (fd >= 0)
		fd_install(fd, file);

	return fd;
}
|
|
|
|
/**
|
|
* pidfd_open() - Open new pid file descriptor.
|
|
*
|
|
* @pid: pid for which to retrieve a pidfd
|
|
* @flags: flags to pass
|
|
*
|
|
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
|
|
* the process identified by @pid. Currently, the process identified by
|
|
* @pid must be a thread-group leader. This restriction currently exists
|
|
* for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
|
|
* be used with CLONE_THREAD) and pidfd polling (only supports thread group
|
|
* leaders).
|
|
*
|
|
* Return: On success, a cloexec pidfd is returned.
|
|
* On error, a negative errno number will be returned.
|
|
*/
|
|
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
|
|
{
|
|
int fd;
|
|
struct pid *p;
|
|
|
|
if (flags & ~PIDFD_NONBLOCK)
|
|
return -EINVAL;
|
|
|
|
if (pid <= 0)
|
|
return -EINVAL;
|
|
|
|
p = find_get_pid(pid);
|
|
if (!p)
|
|
return -ESRCH;
|
|
|
|
fd = pidfd_create(p, flags);
|
|
|
|
put_pid(p);
|
|
return fd;
|
|
}
|
|
|
|
/*
 * Boot-time setup of the pid allocator for the initial pid namespace:
 * scale pid_max and pid_max_min by the number of possible CPUs, log the
 * result, initialise init_pid_ns.idr and create the slab cache used for
 * struct pid.
 */
void __init pid_idr_init(void)
{
	unsigned int nr_cpus = num_possible_cpus();

	/* Verify no one has done anything silly: */
	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

	/* bump default and minimum pid_max based on number of cpus */
	pid_max = min(pid_max_max,
		      max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * nr_cpus));
	pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * nr_cpus);
	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

	idr_init(&init_pid_ns.idr);

	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
|
|
|
|
static struct file *__pidfd_fget(struct task_struct *task, int fd)
|
|
{
|
|
struct file *file;
|
|
int ret;
|
|
|
|
ret = down_read_killable(&task->signal->exec_update_lock);
|
|
if (ret)
|
|
return ERR_PTR(ret);
|
|
|
|
if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
|
|
file = fget_task(task, fd);
|
|
else
|
|
file = ERR_PTR(-EPERM);
|
|
|
|
up_read(&task->signal->exec_update_lock);
|
|
|
|
return file ?: ERR_PTR(-EBADF);
|
|
}
|
|
|
|
static int pidfd_getfd(struct pid *pid, int fd)
|
|
{
|
|
struct task_struct *task;
|
|
struct file *file;
|
|
int ret;
|
|
|
|
task = get_pid_task(pid, PIDTYPE_PID);
|
|
if (!task)
|
|
return -ESRCH;
|
|
|
|
file = __pidfd_fget(task, fd);
|
|
put_task_struct(task);
|
|
if (IS_ERR(file))
|
|
return PTR_ERR(file);
|
|
|
|
ret = receive_fd(file, O_CLOEXEC);
|
|
fput(file);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* sys_pidfd_getfd() - Get a file descriptor from another process
|
|
*
|
|
* @pidfd: the pidfd file descriptor of the process
|
|
* @fd: the file descriptor number to get
|
|
* @flags: flags on how to get the fd (reserved)
|
|
*
|
|
* This syscall gets a copy of a file descriptor from another process
|
|
* based on the pidfd, and file descriptor number. It requires that
|
|
* the calling process has the ability to ptrace the process represented
|
|
* by the pidfd. The process which is having its file descriptor copied
|
|
* is otherwise unaffected.
|
|
*
|
|
* Return: On success, a cloexec file descriptor is returned.
|
|
* On error, a negative errno number will be returned.
|
|
*/
|
|
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
|
|
unsigned int, flags)
|
|
{
|
|
struct pid *pid;
|
|
struct fd f;
|
|
int ret;
|
|
|
|
/* flags is currently unused - make sure it's unset */
|
|
if (flags)
|
|
return -EINVAL;
|
|
|
|
f = fdget(pidfd);
|
|
if (!f.file)
|
|
return -EBADF;
|
|
|
|
pid = pidfd_pid(f.file);
|
|
if (IS_ERR(pid))
|
|
ret = PTR_ERR(pid);
|
|
else
|
|
ret = pidfd_getfd(pid, fd);
|
|
|
|
fdput(f);
|
|
return ret;
|
|
}
|