Files
linux/kernel/pid.c
Greg Kroah-Hartman ef60b4555d Merge 6.1.141 into android14-6.1-lts
Changes in 6.1.141
	gpio: pca953x: Add missing header(s)
	gpio: pca953x: Split pca953x_restore_context() and pca953x_save_context()
	gpio: pca953x: Simplify code with cleanup helpers
	gpio: pca953x: fix IRQ storm on system wake up
	phy: renesas: rcar-gen3-usb2: Add support to initialize the bus
	phy: renesas: rcar-gen3-usb2: Move IRQ request in probe
	phy: renesas: rcar-gen3-usb2: Lock around hardware registers and driver data
	phy: renesas: rcar-gen3-usb2: Assert PLL reset on PHY power off
	scsi: target: iscsi: Fix timeout on deleted connection
	virtio_ring: Fix data race by tagging event_triggered as racy for KCSAN
	dma-mapping: avoid potential unused data compilation warning
	cgroup: Fix compilation issue due to cgroup_mutex not being exported
	scsi: mpi3mr: Add level check to control event logging
	net: enetc: refactor bulk flipping of RX buffers to separate function
	drm/amdgpu: Allow P2P access through XGMI
	selftests/bpf: Mitigate sockmap_ktls disconnect_after_delete failure
	bpf: fix possible endless loop in BPF map iteration
	samples/bpf: Fix compilation failure for samples/bpf on LoongArch Fedora
	kconfig: merge_config: use an empty file as initfile
	s390/vfio-ap: Fix no AP queue sharing allowed message written to kernel log
	cifs: Add fallback for SMB2 CREATE without FILE_READ_ATTRIBUTES
	cifs: Fix querying and creating MF symlinks over SMB1
	cifs: Fix negotiate retry functionality
	fuse: Return EPERM rather than ENOSYS from link()
	NFSv4: Check for delegation validity in nfs_start_delegation_return_locked()
	NFS: Don't allow waiting for exiting tasks
	SUNRPC: Don't allow waiting for exiting tasks
	arm64: Add support for HIP09 Spectre-BHB mitigation
	tracing: Mark binary printing functions with __printf() attribute
	mailbox: use error ret code of of_parse_phandle_with_args()
	fbdev: fsl-diu-fb: add missing device_remove_file()
	fbcon: Use correct erase colour for clearing in fbcon
	fbdev: core: tileblit: Implement missing margin clearing for tileblit
	cifs: Fix establishing NetBIOS session for SMB2+ connection
	NFSv4: Treat ENETUNREACH errors as fatal for state recovery
	SUNRPC: rpc_clnt_set_transport() must not change the autobind setting
	SUNRPC: rpcbind should never reset the port to the value '0'
	thermal/drivers/qoriq: Power down TMU on system suspend
	dql: Fix dql->limit value when reset.
	lockdep: Fix wait context check on softirq for PREEMPT_RT
	objtool: Properly disable uaccess validation
	PCI: dwc: ep: Ensure proper iteration over outbound map windows
	tools/build: Don't pass test log files to linker
	pNFS/flexfiles: Report ENETDOWN as a connection error
	PCI: vmd: Disable MSI remapping bypass under Xen
	libnvdimm/labels: Fix divide error in nd_label_data_init()
	mmc: host: Wait for Vdd to settle on card power off
	x86/mm: Check return value from memblock_phys_alloc_range()
	i2c: qup: Vote for interconnect bandwidth to DRAM
	i2c: pxa: fix call balance of i2c->clk handling routines
	btrfs: make btrfs_discard_workfn() block_group ref explicit
	btrfs: avoid linker error in btrfs_find_create_tree_block()
	btrfs: run btrfs_error_commit_super() early
	btrfs: fix non-empty delayed iputs list on unmount due to async workers
	btrfs: get zone unusable bytes while holding lock at btrfs_reclaim_bgs_work()
	btrfs: send: return -ENAMETOOLONG when attempting a path that is too long
	drm/amd/display: Guard against setting dispclk low for dcn31x
	i3c: master: svc: Fix missing STOP for master request
	dlm: make tcp still work in multi-link env
	um: Store full CSGSFS and SS register from mcontext
	um: Update min_low_pfn to match changes in uml_reserved
	ext4: reorder capability check last
	scsi: st: Tighten the page format heuristics with MODE SELECT
	scsi: st: ERASE does not change tape location
	vfio/pci: Handle INTx IRQ_NOTCONNECTED
	bpf: Return prog btf_id without capable check
	tcp: reorganize tcp_in_ack_event() and tcp_count_delivered()
	rtc: rv3032: fix EERD location
	thunderbolt: Do not add non-active NVM if NVM upgrade is disabled for retimer
	ASoC: mediatek: mt6359: Add stub for mt6359_accdet_enable_jack_detect
	kbuild: fix argument parsing in scripts/config
	crypto: octeontx2 - suppress auth failure screaming due to negative tests
	dm: restrict dm device size to 2^63-512 bytes
	net/smc: use the correct ndev to find pnetid by pnetid table
	xen: Add support for XenServer 6.1 platform device
	pinctrl-tegra: Restore SFSEL bit when freeing pins
	ASoC: sun4i-codec: support hp-det-gpios property
	ext4: reject the 'data_err=abort' option in nojournal mode
	RDMA/uverbs: Propagate errors from rdma_lookup_get_uobject()
	posix-timers: Add cond_resched() to posix_timer_add() search loop
	timer_list: Don't use %pK through printk()
	netfilter: conntrack: Bound nf_conntrack sysctl writes
	arm64/mm: Check PUD_TYPE_TABLE in pud_bad()
	mmc: dw_mmc: add exynos7870 DW MMC support
	mmc: sdhci: Disable SD card clock before changing parameters
	hwmon: (dell-smm) Increment the number of fans
	ipv6: save dontfrag in cork
	drm/amd/display: calculate the remain segments for all pipes
	gfs2: Check for empty queue in run_queue
	auxdisplay: charlcd: Partially revert "Move hwidth and bwidth to struct hd44780_common"
	ASoC: qcom: sm8250: explicitly set format in sm8250_be_hw_params_fixup()
	iommu/amd/pgtbl_v2: Improve error handling
	cpufreq: tegra186: Share policy per cluster
	crypto: lzo - Fix compression buffer overrun
	arm64: tegra: p2597: Fix gpio for vdd-1v8-dis regulator
	powerpc/prom_init: Fixup missing #size-cells on PowerBook6,7
	ALSA: seq: Improve data consistency at polling
	tcp: bring back NUMA dispersion in inet_ehash_locks_alloc()
	rtc: ds1307: stop disabling alarms on probe
	ieee802154: ca8210: Use proper setters and getters for bitwise types
	ARM: tegra: Switch DSI-B clock parent to PLLD on Tegra114
	media: c8sectpfe: Call of_node_put(i2c_bus) only once in c8sectpfe_probe()
	dm cache: prevent BUG_ON by blocking retries on failed device resumes
	orangefs: Do not truncate file size
	net: phylink: use pl->link_interface in phylink_expects_phy()
	remoteproc: qcom_wcnss: Handle platforms with only single power domain
	drm/amdgpu: Do not program AGP BAR regs under SRIOV in gfxhub_v1_0.c
	media: cx231xx: set device_caps for 417
	pinctrl: bcm281xx: Use "unsigned int" instead of bare "unsigned"
	net: ethernet: ti: cpsw_new: populate netdev of_node
	net: pktgen: fix mpls maximum labels list parsing
	perf/hw_breakpoint: Return EOPNOTSUPP for unsupported breakpoint type
	ALSA: hda/realtek: Enable PC beep passthrough for HP EliteBook 855 G7
	ipv4: fib: Move fib_valid_key_len() to rtm_to_fib_config().
	drm/rockchip: vop2: Add uv swap for cluster window
	media: uvcvideo: Add sanity check to uvc_ioctl_xu_ctrl_map
	clk: imx8mp: inform CCF of maximum frequency of clocks
	x86/bugs: Make spectre user default depend on MITIGATION_SPECTRE_V2
	hwmon: (gpio-fan) Add missing mutex locks
	ARM: at91: pm: fix at91_suspend_finish for ZQ calibration
	drm/mediatek: mtk_dpi: Add checks for reg_h_fre_con existence
	fpga: altera-cvp: Increase credit timeout
	soc: apple: rtkit: Use high prio work queue
	soc: apple: rtkit: Implement OSLog buffers properly
	PCI: brcmstb: Expand inbound window size up to 64GB
	PCI: brcmstb: Add a softdep to MIP MSI-X driver
	firmware: arm_ffa: Set dma_mask for ffa devices
	net/mlx5: Avoid report two health errors on same syndrome
	selftests/net: have `gro.sh -t` return a correct exit code
	drm/amdkfd: KFD release_work possible circular locking
	leds: pwm-multicolor: Add check for fwnode_property_read_u32
	net: ethernet: mtk_ppe_offload: Allow QinQ, double ETH_P_8021Q only
	net: xgene-v2: remove incorrect ACPI_PTR annotation
	bonding: report duplicate MAC address in all situations
	soc: ti: k3-socinfo: Do not use syscon helper to build regmap
	x86/build: Fix broken copy command in genimage.sh when making isoimage
	drm/amd/display: handle max_downscale_src_width fail check
	x86/nmi: Add an emergency handler in nmi_desc & use it in nmi_shootdown_cpus()
	cpuidle: menu: Avoid discarding useful information
	media: adv7180: Disable test-pattern control on adv7180
	libbpf: Fix out-of-bound read
	dm: fix unconditional IO throttle caused by REQ_PREFLUSH
	x86/kaslr: Reduce KASLR entropy on most x86 systems
	MIPS: Use arch specific syscall name match function
	genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie
	MIPS: pm-cps: Use per-CPU variables as per-CPU, not per-core
	clocksource: mips-gic-timer: Enable counter when CPUs start
	scsi: mpt3sas: Send a diag reset if target reset fails
	wifi: rtw88: Fix rtw_init_vht_cap() for RTL8814AU
	wifi: rtw88: Fix rtw_init_ht_cap() for RTL8814AU
	wifi: rtw88: Fix rtw_desc_to_mcsrate() to handle MCS16-31
	wifi: rtw89: fw: propagate error code from rtw89_h2c_tx()
	net: pktgen: fix access outside of user given buffer in pktgen_thread_write()
	EDAC/ie31200: work around false positive build warning
	i3c: master: svc: Flush FIFO before sending Dynamic Address Assignment(DAA)
	serial: mctrl_gpio: split disable_ms into sync and no_sync APIs
	RDMA/core: Fix best page size finding when it can cross SG entries
	pmdomain: imx: gpcv2: use proper helper for property detection
	can: c_can: Use of_property_present() to test existence of DT property
	eth: mlx4: don't try to complete XDP frames in netpoll
	PCI: Fix old_size lower bound in calculate_iosize() too
	ACPI: HED: Always initialize before evged
	vxlan: Join / leave MC group after remote changes
	media: test-drivers: vivid: don't call schedule in loop
	net/mlx5: Modify LSB bitmask in temperature event to include only the first bit
	net/mlx5: Apply rate-limiting to high temperature warning
	ASoC: ops: Enforce platform maximum on initial value
	ASoC: tas2764: Add reg defaults for TAS2764_INT_CLK_CFG
	ASoC: tas2764: Mark SW_RESET as volatile
	ASoC: tas2764: Power up/down amp on mute ops
	ASoC: soc-dai: check return value at snd_soc_dai_set_tdm_slot()
	pinctrl: devicetree: do not goto err when probing hogs in pinctrl_dt_to_map
	smack: recognize ipv4 CIPSO w/o categories
	kunit: tool: Use qboot on QEMU x86_64
	net/mlx4_core: Avoid impossible mlx4_db_alloc() order value
	clk: qcom: clk-alpha-pll: Do not use random stack value for recalc rate
	serial: sh-sci: Update the suspend/resume support
	phy: core: don't require set_mode() callback for phy_get_mode() to work
	drm/amdgpu: reset psp->cmd to NULL after releasing the buffer
	drm/amd/display: Initial psr_version with correct setting
	drm/amdgpu: enlarge the VBIOS binary size limit
	drm/amd/display/dm: drop hw_support check in amdgpu_dm_i2c_xfer()
	net/mlx5: Extend Ethtool loopback selftest to support non-linear SKB
	net/mlx5e: set the tx_queue_len for pfifo_fast
	net/mlx5e: reduce rep rxq depth to 256 for ECPF
	wifi: mac80211: don't unconditionally call drv_mgd_complete_tx()
	wifi: mac80211: remove misplaced drv_mgd_complete_tx() call
	arch/powerpc/perf: Check the instruction type before creating sample with perf_mem_data_src
	ip: fib_rules: Fetch net from fib_rule in fib[46]_rule_configure().
	r8152: add vendor/device ID pair for Dell Alienware AW1022z
	wifi: rtw88: Fix download_firmware_validate() for RTL8814AU
	clk: qcom: camcc-sm8250: Use clk_rcg2_shared_ops for some RCGs
	hwmon: (xgene-hwmon) use appropriate type for the latency value
	media: qcom: camss: csid: Only add TPG v4l2 ctrl if TPG hardware is available
	vxlan: Annotate FDB data races
	r8169: don't scan PHY addresses > 0
	rcu: handle quiescent states for PREEMPT_RCU=n, PREEMPT_COUNT=y
	rcu: handle unstable rdp in rcu_read_unlock_strict()
	rcu: fix header guard for rcu_all_qs()
	perf: Avoid the read if the count is already updated
	ice: count combined queues using Rx/Tx count
	net/mana: fix warning in the writer of client oob
	scsi: lpfc: Handle duplicate D_IDs in ndlp search-by D_ID routine
	scsi: lpfc: Free phba irq in lpfc_sli4_enable_msi() when pci_irq_vector() fails
	scsi: st: Restore some drive settings after reset
	HID: usbkbd: Fix the bit shift number for LED_KANA
	ASoC: codecs: pcm3168a: Allow for 24-bit in provider mode
	drm/ast: Find VBIOS mode from regular display size
	bpftool: Fix readlink usage in get_fd_type
	perf/amd/ibs: Fix perf_ibs_op.cnt_mask for CurCnt
	wifi: rtl8xxxu: retry firmware download on error
	wifi: rtw88: Don't use static local variable in rtw8822b_set_tx_power_index_by_rate
	wifi: rtw89: add wiphy_lock() to work that isn't held wiphy_lock() yet
	spi: zynqmp-gqspi: Always acknowledge interrupts
	regulator: ad5398: Add device tree support
	wifi: ath9k: return by of_get_mac_address
	drm/atomic: clarify the rules around drm_atomic_state->allow_modeset
	drm/panel-edp: Add Starry 116KHD024006
	drm: Add valid clones check
	ASoC: imx-card: Adjust over allocation of memory in imx_card_parse_of()
	pinctrl: meson: define the pull up/down resistor value as 60 kOhm
	ASoC: Intel: bytcr_rt5640: Add DMI quirk for Acer Aspire SW3-013
	ALSA: hda/realtek: Add quirk for HP Spectre x360 15-df1xxx
	nvmet-tcp: don't restore null sk_state_change
	io_uring/fdinfo: annotate racy sq/cq head/tail reads
	btrfs: correct the order of prelim_ref arguments in btrfs__prelim_ref
	wifi: iwlwifi: add support for Killer on MTL
	xenbus: Allow PVH dom0 a non-local xenstore
	__legitimize_mnt(): check for MNT_SYNC_UMOUNT should be under mount_lock
	espintcp: remove encap socket caching to avoid reference leak
	dmaengine: idxd: add per DSA wq workqueue for processing cr faults
	dmaengine: idxd: add idxd_copy_cr() to copy user completion record during page fault handling
	dmaengine: idxd: Fix allowing write() from different address spaces
	remoteproc: qcom_wcnss: Fix on platforms without fallback regulators
	clk: sunxi-ng: d1: Add missing divider for MMC mod clocks
	xfrm: Sanitize marks before insert
	dmaengine: idxd: Fix ->poll() return value
	Bluetooth: L2CAP: Fix not checking l2cap_chan security level
	bridge: netfilter: Fix forwarding of fragmented packets
	ice: fix vf->num_mac count with port representors
	net: dwmac-sun8i: Use parsed internal PHY address instead of 1
	net: lan743x: Restore SGMII CTRL register on resume
	io_uring: fix overflow resched cqe reordering
	sch_hfsc: Fix qlen accounting bug when using peek in hfsc_enqueue()
	octeontx2-pf: Add support for page pool
	octeontx2-pf: Add AF_XDP non-zero copy support
	net/tipc: fix slab-use-after-free Read in tipc_aead_encrypt_done
	octeontx2-af: Set LMT_ENA bit for APR table entries
	octeontx2-af: Fix APR entry mapping based on APR_LMT_CFG
	crypto: algif_hash - fix double free in hash_accept
	padata: do not leak refcount in reorder_work
	can: slcan: allow reception of short error messages
	can: bcm: add locking for bcm_op runtime updates
	can: bcm: add missing rcu read protection for procfs content
	ALSA: pcm: Fix race of buffer access at PCM OSS layer
	ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14ASP10
	llc: fix data loss when reading from a socket in llc_ui_recvmsg()
	platform/x86: dell-wmi-sysman: Avoid buffer overflow in current_password_store()
	drm/edid: fixed the bug that hdr metadata was not reset
	smb: client: Fix use-after-free in cifs_fill_dirent
	smb: client: Reset all search buffer pointers when releasing buffer
	Revert "drm/amd: Keep display off while going into S4"
	memcg: always call cond_resched() after fn()
	mm/page_alloc.c: avoid infinite retries caused by cpuset race
	Revert "arm64: dts: allwinner: h6: Use RSB for AXP805 PMIC connection"
	ksmbd: fix stream write failure
	spi: spi-fsl-dspi: restrict register range for regmap access
	spi: spi-fsl-dspi: Halt the module after a new message transfer
	spi: spi-fsl-dspi: Reset SR flags before sending a new message
	kbuild: Disable -Wdefault-const-init-unsafe
	serial: sh-sci: Save and restore more registers
	pinctrl: tegra: Fix off by one in tegra_pinctrl_get_group()
	i3c: master: svc: Fix implicit fallthrough in svc_i3c_master_ibi_work()
	x86/mm/init: Handle the special case of device private pages in add_pages(), to not increase max_pfn and trigger dma_addressing_limited() bounce buffers bounce buffers
	dmaengine: idxd: Fix passing freed memory in idxd_cdev_open()
	octeontx2-pf: fix page_pool creation fail for rings > 32k
	octeontx2-pf: Fix page pool cache index corruption.
	octeontx2-pf: Fix page pool frag allocation warning
	hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING
	btrfs: check folio mapping after unlock in relocate_one_folio()
	af_unix: Kconfig: make CONFIG_UNIX bool
	af_unix: Return struct unix_sock from unix_get_socket().
	af_unix: Run GC on only one CPU.
	af_unix: Try to run GC async.
	af_unix: Replace BUG_ON() with WARN_ON_ONCE().
	af_unix: Remove io_uring code for GC.
	af_unix: Remove CONFIG_UNIX_SCM.
	af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd.
	af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.
	af_unix: Link struct unix_edge when queuing skb.
	af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb.
	af_unix: Iterate all vertices by DFS.
	af_unix: Detect Strongly Connected Components.
	af_unix: Save listener for embryo socket.
	af_unix: Fix up unix_edge.successor for embryo socket.
	af_unix: Save O(n) setup of Tarjan's algo.
	af_unix: Skip GC if no cycle exists.
	af_unix: Avoid Tarjan's algorithm if unnecessary.
	af_unix: Assign a unique index to SCC.
	af_unix: Detect dead SCC.
	af_unix: Replace garbage collection algorithm.
	af_unix: Remove lock dance in unix_peek_fds().
	af_unix: Try not to hold unix_gc_lock during accept().
	af_unix: Don't access successor in unix_del_edges() during GC.
	af_unix: Add dead flag to struct scm_fp_list.
	af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS
	af_unix: Fix uninit-value in __unix_walk_scc()
	arm64: dts: qcom: sm8350: Fix typo in pil_camera_mem node
	net_sched: hfsc: Address reentrant enqueue adding class to eltree twice
	perf/arm-cmn: Fix REQ2/SNP2 mixup
	perf/arm-cmn: Initialise cmn->cpu earlier
	coredump: fix error handling for replace_fd()
	pid: add pidfd_prepare()
	fork: use pidfd_prepare()
	coredump: hand a pidfd to the usermode coredump helper
	HID: quirks: Add ADATA XPG alpha wireless mouse support
	nfs: don't share pNFS DS connections between net namespaces
	platform/x86: thinkpad_acpi: Support also NEC Lavie X1475JAS
	um: let 'make clean' properly clean underlying SUBARCH as well
	spi: spi-sun4i: fix early activation
	nvme-pci: add NVME_QUIRK_NO_DEEPEST_PS quirk for SOLIDIGM P44 Pro
	NFS: Avoid flushing data while holding directory locks in nfs_rename()
	platform/x86: fujitsu-laptop: Support Lifebook S2110 hotkeys
	platform/x86: thinkpad_acpi: Ignore battery threshold change event notification
	net: ethernet: ti: am65-cpsw: Lower random mac address error print to info
	Linux 6.1.141

Change-Id: I4b93f8e69385f2087bf71545f58ae6f5cee1c5ba
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
2025-06-05 07:17:16 +00:00

744 lines
18 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
* (C) 2002-2003 Nadia Yvette Chambers, IBM
* (C) 2004 Nadia Yvette Chambers, Oracle
* (C) 2002-2004 Ingo Molnar, Red Hat
*
* pid-structures are backing objects for tasks sharing a given ID to chain
* against. There is very little to them aside from hashing them and
* parking tasks using given ID's on a list.
*
* The hash is always changed with the tasklist_lock write-acquired,
* and the hash is only accessed with the tasklist_lock at least
* read-acquired, so there's no additional SMP locking needed here.
*
* We have a list of bitmap pages, which bitmaps represent the PID space.
* Allocating and freeing PIDs is completely lockless. The worst-case
* allocation scenario when all but one out of 1 million PIDs possible are
* allocated already: the scanning of 32 list entries and at most PAGE_SIZE
* bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
*
* Pid namespaces:
* (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
* (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
* Many thanks to Oleg Nesterov for comments and help
*
*/
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
.tasks = {
{ .first = NULL },
{ .first = NULL },
{ .first = NULL },
},
.level = 0,
.numbers = { {
.nr = 0,
.ns = &init_pid_ns,
}, }
};
int pid_max = PID_MAX_DEFAULT;
#define RESERVED_PIDS 300
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
* first use and are never deallocated. This way a low pid_max
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
.ns.count = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
.ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
/*
* Note: disable interrupts while the pidmap_lock is held as an
* interrupt might come in and do read_lock(&tasklist_lock).
*
* If we don't disable interrupts there is a nasty deadlock between
* detach_pid()->free_pid() and another cpu that does
* spin_lock(&pidmap_lock) followed by an interrupt routine that does
* read_lock(&tasklist_lock);
*
* After we clean up the tasklist_lock and know there are no
* irq handlers that take it we can leave the interrupts enabled.
* For now it is easier to be safe than to prove it can't happen.
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
if (!pid)
return;
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
}
EXPORT_SYMBOL_GPL(put_pid);
static void delayed_put_pid(struct rcu_head *rhp)
{
struct pid *pid = container_of(rhp, struct pid, rcu);
put_pid(pid);
}
void free_pid(struct pid *pid)
{
/* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
unsigned long flags;
spin_lock_irqsave(&pidmap_lock, flags);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
switch (--ns->pid_allocated) {
case 2:
case 1:
/* When all that is left in the pid namespace
* is the reaper wake up the reaper. The reaper
* may be sleeping in zap_pid_ns_processes().
*/
wake_up_process(ns->child_reaper);
break;
case PIDNS_ADDING:
/* Handle a fork failure of the first process */
WARN_ON(ns->child_reaper);
ns->pid_allocated = 0;
break;
}
idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);
call_rcu(&pid->rcu, delayed_put_pid);
}
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
int retval = -ENOMEM;
/*
* set_tid_size contains the size of the set_tid array. Starting at
* the most nested currently active PID namespace it tells alloc_pid()
* which PID to set for a process in that most nested PID namespace
* up to set_tid_size PID namespaces. It does not have to set the PID
* for a process in all nested PID namespaces but set_tid_size must
* never be greater than the current ns->level + 1.
*/
if (set_tid_size > ns->level + 1)
return ERR_PTR(-EINVAL);
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
int tid = 0;
if (set_tid_size) {
tid = set_tid[ns->level - i];
retval = -EINVAL;
if (tid < 1 || tid >= pid_max)
goto out_free;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
goto out_free;
retval = -EPERM;
if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_free;
set_tid_size--;
}
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
tid + 1, GFP_ATOMIC);
/*
* If ENOSPC is returned it means that the PID is
* alreay in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
nr = -EEXIST;
} else {
int pid_min = 1;
/*
* init really needs pid 1, but after reaching the
* maximum wrap back to RESERVED_PIDS
*/
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
goto out_free;
}
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
}
/*
* ENOMEM is not the most obvious choice especially for the case
* where the child subreaper has already exited and the pid
* namespace denies the creation of any new processes. But ENOMEM
* is what we have exposed to userspace for a long time and it is
* documented behavior for pid namespaces. So we can't easily
* change it even if there were an error code better suited.
*/
retval = -ENOMEM;
get_pid_ns(ns);
refcount_set(&pid->count, 1);
spin_lock_init(&pid->lock);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
return pid;
out_unlock:
spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
out_free:
spin_lock_irq(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
}
/* On failure to allocate the first pid, reset the state */
if (ns->pid_allocated == PIDNS_ADDING)
idr_set_cursor(&ns->idr, 0);
spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
}
void disable_pid_allocation(struct pid_namespace *ns)
{
spin_lock_irq(&pidmap_lock);
ns->pid_allocated &= ~PIDNS_ADDING;
spin_unlock_irq(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);
struct pid *find_vpid(int nr)
{
return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
return (type == PIDTYPE_PID) ?
&task->thread_pid :
&task->signal->pids[type];
}
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid = *task_pid_ptr(task, type);
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
static void __change_pid(struct task_struct *task, enum pid_type type,
struct pid *new)
{
struct pid **pid_ptr = task_pid_ptr(task, type);
struct pid *pid;
int tmp;
pid = *pid_ptr;
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
free_pid(pid);
}
void detach_pid(struct task_struct *task, enum pid_type type)
{
__change_pid(task, type, NULL);
}
void change_pid(struct task_struct *task, enum pid_type type,
struct pid *pid)
{
__change_pid(task, type, pid);
attach_pid(task, type);
}
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
struct pid *pid1 = left->thread_pid;
struct pid *pid2 = right->thread_pid;
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
/* Swap the single entry tid lists */
hlists_swap_heads_rcu(head1, head2);
/* Swap the per task_struct pid */
rcu_assign_pointer(left->thread_pid, pid2);
rcu_assign_pointer(right->thread_pid, pid1);
/* Swap the cached value */
WRITE_ONCE(left->pid, pid_nr(pid2));
WRITE_ONCE(right->pid, pid_nr(pid1));
}
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
if (type == PIDTYPE_PID)
new->thread_pid = old->thread_pid;
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result = NULL;
if (pid) {
struct hlist_node *first;
first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
lockdep_tasklist_lock_is_held());
if (first)
result = hlist_entry(first, struct task_struct, pid_links[(type)]);
}
return result;
}
EXPORT_SYMBOL(pid_task);
/*
* Must be called under rcu_read_lock().
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
"find_task_by_pid_ns() needs rcu_read_lock() protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
struct task_struct *find_task_by_vpid(pid_t vnr)
{
return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_task_by_vpid);
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
struct task_struct *task;
rcu_read_lock();
task = find_task_by_vpid(nr);
if (task)
get_task_struct(task);
rcu_read_unlock();
return task;
}
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
rcu_read_unlock();
return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result;
rcu_read_lock();
result = pid_task(pid, type);
if (result)
get_task_struct(result);
rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);
struct pid *find_get_pid(pid_t nr)
{
struct pid *pid;
rcu_read_lock();
pid = get_pid(find_vpid(nr));
rcu_read_unlock();
return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
struct upid *upid;
pid_t nr = 0;
if (pid && ns->level <= pid->level) {
upid = &pid->numbers[ns->level];
if (upid->ns == ns)
nr = upid->nr;
}
return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);
pid_t pid_vnr(struct pid *pid)
{
return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
struct pid_namespace *ns)
{
pid_t nr = 0;
rcu_read_lock();
if (!ns)
ns = task_active_pid_ns(current);
nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
rcu_read_unlock();
return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
/*
* Used by proc to find the first pid that is greater than or equal to nr.
*
* If there is a pid at nr this function is exactly the same as find_pid_ns.
*/
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
return idr_get_next(&ns->idr, &nr);
}
EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
struct fd f;
struct pid *pid;
f = fdget(fd);
if (!f.file)
return ERR_PTR(-EBADF);
pid = pidfd_pid(f.file);
if (!IS_ERR(pid)) {
get_pid(pid);
*flags = f.file->f_flags;
}
fdput(f);
return pid;
}
/**
* pidfd_get_task() - Get the task associated with a pidfd
*
* @pidfd: pidfd for which to get the task
* @flags: flags associated with this pidfd
*
* Return the task associated with @pidfd. The function takes a reference on
* the returned task. The caller is responsible for releasing that reference.
*
* Currently, the process identified by @pidfd is always a thread-group leader.
* This restriction currently exists for all aspects of pidfds including pidfd
* creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
* (only supports thread group leaders).
*
* Return: On success, the task_struct associated with the pidfd.
* On error, a negative errno number will be returned.
*/
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
unsigned int f_flags;
struct pid *pid;
struct task_struct *task;
pid = pidfd_get_pid(pidfd, &f_flags);
if (IS_ERR(pid))
return ERR_CAST(pid);
task = get_pid_task(pid, PIDTYPE_TGID);
put_pid(pid);
if (!task)
return ERR_PTR(-ESRCH);
*flags = f_flags;
return task;
}
/**
* pidfd_create() - Create a new pid file descriptor.
*
* @pid: struct pid that the pidfd will reference
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set.
*
* Note, that this function can only be called after the fd table has
* been unshared to avoid leaking the pidfd to the new process.
*
* This symbol should not be explicitly exported to loadable modules.
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
int pidfd_create(struct pid *pid, unsigned int flags)
{
int pidfd;
struct file *pidfd_file;
pidfd = pidfd_prepare(pid, flags, &pidfd_file);
if (pidfd < 0)
return pidfd;
fd_install(pidfd, pidfd_file);
return pidfd;
}
/**
* pidfd_open() - Open new pid file descriptor.
*
* @pid: pid for which to retrieve a pidfd
* @flags: flags to pass
*
* This creates a new pid file descriptor with the O_CLOEXEC flag set for
* the process identified by @pid. Currently, the process identified by
* @pid must be a thread-group leader. This restriction currently exists
* for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
* be used with CLONE_THREAD) and pidfd polling (only supports thread group
* leaders).
*
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
int fd;
struct pid *p;
if (flags & ~PIDFD_NONBLOCK)
return -EINVAL;
if (pid <= 0)
return -EINVAL;
p = find_get_pid(pid);
if (!p)
return -ESRCH;
fd = pidfd_create(p, flags);
put_pid(p);
return fd;
}
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
}
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;
int ret;
ret = down_read_killable(&task->signal->exec_update_lock);
if (ret)
return ERR_PTR(ret);
if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
file = fget_task(task, fd);
else
file = ERR_PTR(-EPERM);
up_read(&task->signal->exec_update_lock);
return file ?: ERR_PTR(-EBADF);
}
static int pidfd_getfd(struct pid *pid, int fd)
{
struct task_struct *task;
struct file *file;
int ret;
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
file = __pidfd_fget(task, fd);
put_task_struct(task);
if (IS_ERR(file))
return PTR_ERR(file);
ret = receive_fd(file, O_CLOEXEC);
fput(file);
return ret;
}
/**
* sys_pidfd_getfd() - Get a file descriptor from another process
*
* @pidfd: the pidfd file descriptor of the process
* @fd: the file descriptor number to get
* @flags: flags on how to get the fd (reserved)
*
* This syscall gets a copy of a file descriptor from another process
* based on the pidfd, and file descriptor number. It requires that
* the calling process has the ability to ptrace the process represented
* by the pidfd. The process which is having its file descriptor copied
* is otherwise unaffected.
*
* Return: On success, a cloexec file descriptor is returned.
* On error, a negative errno number will be returned.
*/
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
unsigned int, flags)
{
struct pid *pid;
struct fd f;
int ret;
/* flags is currently unused - make sure it's unset */
if (flags)
return -EINVAL;
f = fdget(pidfd);
if (!f.file)
return -EBADF;
pid = pidfd_pid(f.file);
if (IS_ERR(pid))
ret = PTR_ERR(pid);
else
ret = pidfd_getfd(pid, fd);
fdput(f);
return ret;
}