linux-kernelorg-stable/kernel/nscommon.c

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */

#include <linux/ns_common.h>
#include <linux/proc_ns.h>
#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>

#ifdef CONFIG_DEBUG_VFS
/*
 * Debug-only consistency check: for each namespace type handled below,
 * compare @ops against the operations table that belongs to @ns->ns_type
 * and report a mismatch through VFS_WARN_ON_ONCE().
 *
 * @ns:  namespace whose ns_type selects the expected operations table
 * @ops: operations table the caller is about to associate with @ns
 *
 * Every case except CLONE_NEWNS is guarded by the config option of the
 * corresponding namespace type, so a type that is compiled out is not
 * checked at all.
 */
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
	switch (ns->ns_type) {
#ifdef CONFIG_CGROUPS
	case CLONE_NEWCGROUP:
		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
		break;
#endif
#ifdef CONFIG_IPC_NS
	case CLONE_NEWIPC:
		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
		break;
#endif
	case CLONE_NEWNS:
		VFS_WARN_ON_ONCE(ops != &mntns_operations);
		break;
#ifdef CONFIG_NET_NS
	case CLONE_NEWNET:
		VFS_WARN_ON_ONCE(ops != &netns_operations);
		break;
#endif
#ifdef CONFIG_PID_NS
	case CLONE_NEWPID:
		VFS_WARN_ON_ONCE(ops != &pidns_operations);
		break;
#endif
#ifdef CONFIG_TIME_NS
	case CLONE_NEWTIME:
		VFS_WARN_ON_ONCE(ops != &timens_operations);
		break;
#endif
#ifdef CONFIG_USER_NS
	case CLONE_NEWUSER:
		VFS_WARN_ON_ONCE(ops != &userns_operations);
		break;
#endif
#ifdef CONFIG_UTS_NS
	case CLONE_NEWUTS:
		VFS_WARN_ON_ONCE(ops != &utsns_operations);
		break;
#endif
	/* No default: any other ns_type value passes through unchecked. */
	}
}
#endif

/**
 * __ns_common_init - initialize the common part of a namespace
 * @ns:      structure to initialize
 * @ns_type: value recorded in @ns->ns_type
 * @ops:     operations table recorded in @ns->ops
 * @inum:    number to record in @ns->inum, or 0 to obtain one from
 *           proc_alloc_inum()
 *
 * Sets the reference count to 1, resets the id and stash pointer, clears
 * all tree nodes and list heads, and finally sets the active reference
 * count: 1 for an initial namespace, 0 for every other namespace.
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * proc_alloc_inum(). In the failure case the active reference count has
 * not been written.
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	int ret;

	/* Plain fields first. */
	ns->ns_type = ns_type;
	ns->ops = ops;
	ns->ns_id = 0;
	ns->stashed = NULL;
	refcount_set(&ns->__ns_ref, 1);

	/* Tree linkage: every node starts out cleared, the owner tree empty. */
	ns->ns_owner_tree = RB_ROOT;
	RB_CLEAR_NODE(&ns->ns_tree_node);
	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
	RB_CLEAR_NODE(&ns->ns_owner_tree_node);

	/* List linkage: every list head starts out empty. */
	INIT_LIST_HEAD(&ns->ns_list_node);
	INIT_LIST_HEAD(&ns->ns_unified_list_node);
	INIT_LIST_HEAD(&ns->ns_owner);
	INIT_LIST_HEAD(&ns->ns_owner_entry);

#ifdef CONFIG_DEBUG_VFS
	/* Needs ns->ns_type, which has been set above. */
	ns_debug(ns, ops);
#endif

	/* A caller-supplied number wins; otherwise allocate one. */
	if (inum) {
		ns->inum = inum;
	} else {
		ret = proc_alloc_inum(&ns->inum);
		if (ret)
			return ret;
	}

	/*
	 * Regular namespaces begin with no active references. Initial
	 * namespaces are always active, so they begin with one.
	 */
	atomic_set(&ns->__ns_ref_active, is_initial_namespace(ns) ? 1 : 0);
	return 0;
}

/*
 * Release the number held in @ns->inum by handing it to proc_free_inum().
 * This undoes the inum setup performed by __ns_common_init().
 *
 * NOTE(review): the number is released unconditionally, including when
 * __ns_common_init() was given a caller-supplied inum rather than
 * allocating one - presumably such namespaces never reach this function;
 * confirm against callers.
 */
void __ns_common_free(struct ns_common *ns)
{
	proc_free_inum(ns->inum);
}

/**
 * ns_owner - look up the owning user namespace that takes part in
 *            active reference counting
 * @ns: namespace to query
 *
 * Asks @ns->ops->owner() for the owning user namespace of @ns.
 *
 * Return: the owner as a struct ns_common, or NULL when @ns has no
 * operations table, when the callback reports no owner, or when the owner
 * is init_user_ns (which is always active and therefore skipped).
 */
struct ns_common *__must_check ns_owner(struct ns_common *ns)
{
	struct user_namespace *owner;

	/* Without an operations table there is no owner callback to call. */
	if (unlikely(!ns->ops))
		return NULL;

	VFS_WARN_ON_ONCE(!ns->ops->owner);
	owner = ns->ops->owner(ns);

	/* Only init_user_ns itself is expected to have no owner. */
	VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));

	/* No owner, or an owner that is always active: nothing to return. */
	if (!owner || owner == &init_user_ns)
		return NULL;

	return to_ns_common(owner);
}

/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * active count itself goes down.
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * The iteration stops once we reach a namespace that still has active
 * references.
 */
void __ns_ref_active_put(struct ns_common *ns)
{
	/* Initial namespaces stay active forever; there is nothing to drop. */
	if (is_ns_init_id(ns))
		return;

	/* Other active references remain, so nothing cascades upwards. */
	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
		return;
	}

	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));

	/*
	 * The namespace just lost its last active reference. Drop one
	 * active reference on its owner and keep climbing for as long as
	 * each owner loses its last active reference too. The walk ends
	 * when ns_owner() returns NULL; it never returns init_user_ns.
	 */
	for (ns = ns_owner(ns); ns; ns = ns_owner(ns)) {
		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
		if (atomic_dec_and_test(&ns->__ns_ref_active))
			continue;
		/* This owner still has active references: stop here. */
		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
		return;
	}
}

/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
 * That single reference is only released once the child namespace's
 * active count itself goes down. This makes it possible to efficiently
 * resurrect a namespace tree:
 *
 * A regular namespace tree might look as follows:
 * Legend:
 * + : adding active reference
 * - : dropping active reference
 * x : always active (initial namespace)
 *
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      +
 *                        user_ns1 (2)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   +   +
 *                        user_ns2 (3)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If both net_ns and pid_ns put their last active reference on
 * themselves it will cascade to user_ns1 dropping its own active
 * reference and dropping one active reference on user_ns2:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        +   -   +
 *                        user_ns2 (2)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Assume the whole tree is dead (every active count has dropped to 0) but
 * none of the namespaces has been freed yet:
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        -      -
 *                        user_ns1 (0)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   -   -
 *                        user_ns2 (0)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
 *
 *                 net_ns          pid_ns
 *                       \        /
 *                        +      -
 *                        user_ns1 (1)
 *                            |
 *                 ipc_ns     |     uts_ns
 *                       \    |    /
 *                        -   +   -
 *                        user_ns2 (1)
 *                            |
 *            cgroup_ns       |       mnt_ns
 *                     \      |      /
 *                      x     x     x
 *                      init_user_ns (1)
 *
 * If net_ns had a zero reference count and we bumped it we also need to
 * take another reference on its owning user namespace. Similarly, if
 * pid_ns had a zero reference count it also needs to take another
 * reference on its owning user namespace. So both net_ns and pid_ns
 * will each have their own reference on the owning user namespace.
 *
 * If the owning user namespace user_ns1 had a zero reference count then
 * it also needs to take another reference on its owning user namespace
 * and so on.
 */
void __ns_ref_active_get(struct ns_common *ns)
{
	int prev;

	/* Initial namespaces stay active forever; there is nothing to take. */
	if (is_ns_init_id(ns))
		return;

	/*
	 * Take the reference. A non-zero previous value means the
	 * namespace was already active and no owner needs to be touched.
	 */
	prev = atomic_fetch_add(1, &ns->__ns_ref_active);
	VFS_WARN_ON_ONCE(prev < 0);
	if (likely(prev))
		return;

	/*
	 * The count went from 0 to 1, i.e. the namespace was resurrected.
	 * Take one active reference on each owner in turn, stopping at the
	 * first one that was already active or when ns_owner() returns
	 * NULL; it never returns init_user_ns.
	 */
	for (ns = ns_owner(ns); ns; ns = ns_owner(ns)) {
		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
		prev = atomic_fetch_add(1, &ns->__ns_ref_active);
		VFS_WARN_ON_ONCE(prev < 0);
		if (likely(prev))
			return;
	}
}