Merge: cgroup: Backport upstream cgroup commits up to v6.12

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/6581

JIRA: https://issues.redhat.com/browse/RHEL-80382

This MR backports upstream cgroup commits up to v6.12 with relevant fixes, if applicable.

Signed-off-by: Radostin Stoyanov <rstoyano@redhat.com>

Approved-by: Waiman Long <longman@redhat.com>
Approved-by: Rafael Aquini <raquini@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>

Merged-by: Jan Stancek <jstancek@redhat.com>
commit ce052eaf83
Jan Stancek, 2025-05-26 10:33:48 +02:00
31 changed files with 2042 additions and 1405 deletions

Documentation/admin-guide/cgroup-v2.rst

@@ -533,10 +533,12 @@ cgroup namespace on namespace creation.
 Because the resource control interface files in a given directory
 control the distribution of the parent's resources, the delegatee
 shouldn't be allowed to write to them. For the first method, this is
-achieved by not granting access to these files. For the second, the
-kernel rejects writes to all files other than "cgroup.procs" and
-"cgroup.subtree_control" on a namespace root from inside the
-namespace.
+achieved by not granting access to these files. For the second, files
+outside the namespace should be hidden from the delegatee by the means
+of at least mount namespacing, and the kernel rejects writes to all
+files on a namespace root from inside the cgroup namespace, except for
+those files listed in "/sys/kernel/cgroup/delegate" (including
+"cgroup.procs", "cgroup.threads", "cgroup.subtree_control", etc.).
 The end results are equivalent for both delegation types. Once
 delegated, the user can build sub-hierarchy under the directory,
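As an illustration of the first delegation method described above (granting file access to the delegatee), here is a minimal userspace sketch; the subtree path and uid/gid are hypothetical, and the canonical list of delegatable files can always be read from "/sys/kernel/cgroup/delegate":

/* Illustrative sketch only: delegate a cgroup v2 subtree by chowning the
 * directory and its delegatable knobs to an unprivileged user. The path
 * and uid/gid are made up for the example. */
#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	const char *dir = "/sys/fs/cgroup/delegated";
	const char *files[] = { "cgroup.procs", "cgroup.threads",
				"cgroup.subtree_control" };
	uid_t uid = 1000;
	gid_t gid = 1000;
	char path[256];
	size_t i;

	if (mkdir(dir, 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}
	if (chown(dir, uid, gid)) {
		perror("chown");
		return 1;
	}
	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		snprintf(path, sizeof(path), "%s/%s", dir, files[i]);
		if (chown(path, uid, gid))
			perror(path);	/* file may be absent on older kernels */
	}
	return 0;
}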
@@ -1708,6 +1710,8 @@ PAGE_SIZE multiple when read back.
 Note that this is subtly different from setting memory.swap.max to
 0, as it still allows for pages to be written to the zswap pool.
+This setting has no effect if zswap is disabled, and swapping
+is allowed unless memory.swap.max is set to 0.
 memory.pressure
 A read-only nested-keyed file.

MAINTAINERS

@@ -4928,9 +4928,12 @@ S: Maintained
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
 F: Documentation/admin-guide/cgroup-v1/cpusets.rst
 F: include/linux/cpuset.h
+F: kernel/cgroup/cpuset-internal.h
+F: kernel/cgroup/cpuset-v1.c
 F: kernel/cgroup/cpuset.c
 F: tools/testing/selftests/cgroup/test_cpuset.c
 F: tools/testing/selftests/cgroup/test_cpuset_prs.sh
+F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh
 CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
 M: Johannes Weiner <hannes@cmpxchg.org>

include/linux/cgroup-defs.h

@@ -172,7 +172,11 @@ struct cgroup_subsys_state {
 /* reference count - access via css_[try]get() and css_put() */
 struct percpu_ref refcnt;
-/* siblings list anchored at the parent's ->children */
+/*
+ * siblings list anchored at the parent's ->children
+ *
+ * linkage is protected by cgroup_mutex or RCU
+ */
 struct list_head sibling;
 struct list_head children;
@@ -323,6 +327,7 @@ struct cgroup_base_stat {
 #ifdef CONFIG_SCHED_CORE
 u64 forceidle_sum;
 #endif
+u64 ntime;
 };
 /*

include/linux/cpuset.h

@@ -99,6 +99,7 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 const struct task_struct *tsk2);
+#ifdef CONFIG_CPUSETS_V1
 #define cpuset_memory_pressure_bump() \
 do { \
 if (cpuset_memory_pressure_enabled) \
@@ -106,6 +107,9 @@ extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
 } while (0)
 extern int cpuset_memory_pressure_enabled;
 extern void __cpuset_memory_pressure_bump(void);
+#else
+static inline void cpuset_memory_pressure_bump(void) { }
+#endif
 extern void cpuset_task_status_allowed(struct seq_file *m,
 struct task_struct *task);
@@ -113,7 +117,6 @@ extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 struct pid *pid, struct task_struct *tsk);
 extern int cpuset_mem_spread_node(void);
-extern int cpuset_slab_spread_node(void);
 static inline int cpuset_do_page_mem_spread(void)
 {
@@ -251,11 +254,6 @@ static inline int cpuset_mem_spread_node(void)
 return 0;
 }
-static inline int cpuset_slab_spread_node(void)
-{
-return 0;
-}
 static inline int cpuset_do_page_mem_spread(void)
 {
 return 0;

include/linux/mm.h

@@ -1433,7 +1433,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 struct page *page, unsigned int nr, unsigned long addr);
 vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
 /*
@@ -1783,26 +1782,26 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
 #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
 }
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-return page->_last_cpupid;
+return folio->_last_cpupid;
 }
 static inline void page_cpupid_reset_last(struct page *page)
 {
 page->_last_cpupid = -1 & LAST_CPUPID_MASK;
 }
 #else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
 }
-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
 static inline void page_cpupid_reset_last(struct page *page)
 {
@@ -1810,11 +1809,12 @@ static inline void page_cpupid_reset_last(struct page *page)
 }
 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 int last_time;
-last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+last_time = folio_xchg_last_cpupid(folio,
+time >> PAGE_ACCESS_TIME_BUCKETS);
 return last_time << PAGE_ACCESS_TIME_BUCKETS;
 }
@@ -1828,19 +1828,19 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
 }
 }
 #else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
 {
-return page_to_nid(page); /* XXX */
+return folio_nid(folio); /* XXX */
 }
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
 {
 return 0;
 }
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
 {
-return page_to_nid(page); /* XXX */
+return folio_nid(folio); /* XXX */
 }
 static inline int cpupid_to_nid(int cpupid)

include/linux/mm_types.h

@@ -190,6 +190,10 @@ struct page {
 not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+int _last_cpupid;
+#endif
 #ifdef CONFIG_KMSAN
 /*
 * KMSAN metadata for this page:
@@ -201,10 +205,6 @@ struct page {
 struct page *kmsan_shadow;
 struct page *kmsan_origin;
 #endif
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-int _last_cpupid;
-#endif
 } _struct_page_alignment;
 /*
@@ -263,6 +263,8 @@ typedef struct {
 * @_refcount: Do not access this member directly. Use folio_ref_count()
 * to find how many references there are to this folio.
 * @memcg_data: Memory Control Group data.
+* @virtual: Virtual address in the kernel direct map.
+* @_last_cpupid: IDs of last CPU and last process that accessed the folio.
 * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
 * @_nr_pages_mapped: Do not use directly, call folio_mapcount().
 * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
@@ -308,6 +310,12 @@ struct folio {
 atomic_t _refcount;
 #ifdef CONFIG_MEMCG
 unsigned long memcg_data;
+#endif
+#if defined(WANT_PAGE_VIRTUAL)
+void *virtual;
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+int _last_cpupid;
 #endif
 /* private: the union with struct page is transitional */
 };
@@ -364,6 +372,12 @@ FOLIO_MATCH(_refcount, _refcount);
 #ifdef CONFIG_MEMCG
 FOLIO_MATCH(memcg_data, memcg_data);
 #endif
+#if defined(WANT_PAGE_VIRTUAL)
+FOLIO_MATCH(virtual, virtual);
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+FOLIO_MATCH(_last_cpupid, _last_cpupid);
+#endif
 #undef FOLIO_MATCH
 #define FOLIO_MATCH(pg, fl) \
 static_assert(offsetof(struct folio, fl) == \

include/linux/sched.h

@@ -1244,7 +1244,6 @@ struct task_struct {
 /* Sequence number to catch updates: */
 seqcount_spinlock_t mems_allowed_seq;
 int cpuset_mem_spread_rotor;
-int cpuset_slab_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
 /* Control Group info protected by css_set_lock: */

init/Kconfig

@@ -1123,9 +1123,23 @@ config CPUSETS
 Say N if unsure.
+config CPUSETS_V1
+bool "Legacy cgroup v1 cpusets controller"
+depends on CPUSETS
+default n
+help
+Legacy cgroup v1 cpusets controller which has been deprecated by
+cgroup v2 implementation. The v1 is there for legacy applications
+which haven't migrated to the new cgroup v2 interface yet. Legacy
+interface includes cpuset filesystem and /proc/<pid>/cpuset. If you
+do not have any such application then you are completely fine leaving
+this option disabled.
+Say N if unsure.
 config PROC_PID_CPUSET
 bool "Include legacy /proc/<pid>/cpuset file"
-depends on CPUSETS
+depends on CPUSETS_V1
 default y
 config CGROUP_DEVICE

kernel/cgroup/Makefile

@@ -5,5 +5,6 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
 obj-$(CONFIG_CGROUP_MISC) += misc.o
 obj-$(CONFIG_CGROUP_DEBUG) += debug.o

kernel/cgroup/cgroup-v1.c

@@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid)
 return cgroup_no_v1_mask & (1 << ssid);
 }
+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+/* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
+return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
+}
 /**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
@@ -675,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
 * cgroup_mutex contention.
 */
-for_each_subsys(ss, i)
+for_each_subsys(ss, i) {
+if (cgroup1_subsys_absent(ss))
+continue;
 seq_printf(m, "%s\t%d\t%d\t%d\n",
 ss->legacy_name, ss->root->hierarchy_id,
 atomic_read(&ss->root->nr_cgrps),
 cgroup_ssid_enabled(i));
+}
 return 0;
 }
@@ -932,7 +941,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
 if (ret != -ENOPARAM)
 return ret;
 for_each_subsys(ss, i) {
-if (strcmp(param->key, ss->legacy_name))
+if (strcmp(param->key, ss->legacy_name) ||
+cgroup1_subsys_absent(ss))
 continue;
 if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
 return invalfc(fc, "Disabled controller '%s'",
@@ -1024,7 +1034,8 @@ static int check_cgroupfs_options(struct fs_context *fc)
 mask = ~((u16)1 << cpuset_cgrp_id);
 #endif
 for_each_subsys(ss, i)
-if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
+!cgroup1_subsys_absent(ss))
 enabled |= 1 << i;
 ctx->subsys_mask &= enabled;
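With cgroup1_subsys_absent() above, controllers that have no v1 interface files (but do provide v2 files via dfl_cftypes) are no longer offered as v1 mount options and are no longer listed in /proc/cgroups. A minimal sketch that dumps /proc/cgroups, the file produced by proc_cgroupstats_show() above; illustrative only, assuming nothing beyond the long-standing four-column format:

/* Dump /proc/cgroups: subsys name, hierarchy id, cgroup count, enabled. */
#include <stdio.h>

int main(void)
{
	char name[64];
	int hier, ncgrps, enabled;
	FILE *f = fopen("/proc/cgroups", "r");

	if (!f) {
		perror("/proc/cgroups");
		return 1;
	}
	/* skip the "#subsys_name hierarchy num_cgroups enabled" header line */
	fscanf(f, "%*[^\n]\n");
	while (fscanf(f, "%63s %d %d %d", name, &hier, &ncgrps, &enabled) == 4)
		printf("%-16s hierarchy=%d cgroups=%d enabled=%d\n",
		       name, hier, ncgrps, enabled);
	fclose(f);
	return 0;
}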

kernel/cgroup/cgroup.c

@@ -2331,7 +2331,7 @@ static struct file_system_type cgroup2_fs_type = {
 .fs_flags = FS_USERNS_MOUNT,
 };
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
 static const struct fs_context_operations cpuset_fs_context_ops = {
 .get_tree = cgroup1_get_tree,
 .free = cgroup_fs_context_free,
@@ -4124,7 +4124,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 * If namespaces are delegation boundaries, disallow writes to
 * files in an non-init namespace root from inside the namespace
 * except for the files explicitly marked delegatable -
-* cgroup.procs and cgroup.subtree_control.
+* eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
 */
 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
 !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
@@ -4623,8 +4623,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
-* section. This function will return the correct next descendant as long
-* as both @pos and @root are accessible and @pos is a descendant of @root.
+* section. Additionally, it isn't necessary to hold onto a reference to @pos.
+* This function will return the correct next descendant as long as both @pos
+* and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
@@ -4672,8 +4673,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
-* section. This function will return the correct rightmost descendant as
-* long as @pos is accessible.
+* section. Additionally, it isn't necessary to hold onto a reference to @pos.
+* This function will return the correct rightmost descendant as long as @pos
+* is accessible.
 */
 struct cgroup_subsys_state *
 css_rightmost_descendant(struct cgroup_subsys_state *pos)
@@ -4717,9 +4719,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
-* section. This function will return the correct next descendant as long
-* as both @pos and @cgroup are accessible and @pos is a descendant of
-* @cgroup.
+* section. Additionally, it isn't necessary to hold onto a reference to @pos.
+* This function will return the correct next descendant as long as both @pos
+* and @cgroup are accessible and @pos is a descendant of @cgroup.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
@@ -5780,7 +5782,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
 {
 struct cgroup *cgroup;
 int ret = false;
-int level = 1;
+int level = 0;
 lockdep_assert_held(&cgroup_mutex);
@@ -5788,7 +5790,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
 if (cgroup->nr_descendants >= cgroup->max_descendants)
 goto fail;
-if (level > cgroup->max_depth)
+if (level >= cgroup->max_depth)
 goto fail;
 level++;
@@ -6242,7 +6244,7 @@ int __init cgroup_init(void)
 WARN_ON(register_filesystem(&cgroup_fs_type));
 WARN_ON(register_filesystem(&cgroup2_fs_type));
 WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
 WARN_ON(register_filesystem(&cpuset_fs_type));
 #endif
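Both cpuset_fs_type sites above are now gated on CONFIG_CPUSETS_V1, so the legacy "cpuset" filesystem type is only registered when that option is enabled. A minimal sketch of the userspace-visible effect (the mount point is illustrative and must already exist; on a kernel built with CPUSETS_V1=n the mount is expected to fail with ENODEV):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Equivalent to mounting cgroup v1 with the cpuset controller. */
	if (mount("cpuset", "/mnt/cpuset", "cpuset", 0, NULL)) {
		perror("mount cpuset");
		return 1;
	}
	printf("legacy cpuset hierarchy mounted at /mnt/cpuset\n");
	return 0;
}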

kernel/cgroup/cpuset-internal.h (new file)

@@ -0,0 +1,305 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef __CPUSET_INTERNAL_H
#define __CPUSET_INTERNAL_H
#include <linux/cgroup.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/spinlock.h>
#include <linux/union_find.h>
/* See "Frequency meter" comments, below. */
struct fmeter {
int cnt; /* unprocessed events count */
int val; /* most recent output value */
time64_t time; /* clock (secs) when val computed */
spinlock_t lock; /* guards read or write of above */
};
/*
* Invalid partition error code
*/
enum prs_errcode {
PERR_NONE = 0,
PERR_INVCPUS,
PERR_INVPARENT,
PERR_NOTPART,
PERR_NOTEXCL,
PERR_NOCPUS,
PERR_HOTPLUG,
PERR_CPUSEMPTY,
PERR_HKEEPING,
PERR_ACCESS,
};
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
CS_CPU_EXCLUSIVE,
CS_MEM_EXCLUSIVE,
CS_MEM_HARDWALL,
CS_MEMORY_MIGRATE,
CS_SCHED_LOAD_BALANCE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
/* The various types of files and directories in a cpuset file system */
typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
FILE_SUBPARTS_CPULIST,
FILE_EXCLUSIVE_CPULIST,
FILE_EFFECTIVE_XCPULIST,
FILE_ISOLATED_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
FILE_SCHED_LOAD_BALANCE,
FILE_PARTITION_ROOT,
FILE_SCHED_RELAX_DOMAIN_LEVEL,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
} cpuset_filetype_t;
struct cpuset {
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
/*
* On default hierarchy:
*
* The user-configured masks can only be changed by writing to
* cpuset.cpus and cpuset.mems, and won't be limited by the
* parent masks.
*
* The effective masks is the real masks that apply to the tasks
* in the cpuset. They may be changed if the configured masks are
* changed or hotplug happens.
*
* effective_mask == configured_mask & parent's effective_mask,
* and if it ends up empty, it will inherit the parent's mask.
*
*
* On legacy hierarchy:
*
* The user-configured masks are always the same with effective masks.
*/
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
cpumask_var_t effective_cpus;
nodemask_t effective_mems;
/*
* Exclusive CPUs dedicated to current cgroup (default hierarchy only)
*
* The effective_cpus of a valid partition root comes solely from its
* effective_xcpus and some of the effective_xcpus may be distributed
* to sub-partitions below & hence excluded from its effective_cpus.
* For a valid partition root, its effective_cpus have no relationship
* with cpus_allowed unless its exclusive_cpus isn't set.
*
* This value will only be set if either exclusive_cpus is set or
* when this cpuset becomes a local partition root.
*/
cpumask_var_t effective_xcpus;
/*
* Exclusive CPUs as requested by the user (default hierarchy only)
*
* Its value is independent of cpus_allowed and designates the set of
* CPUs that can be granted to the current cpuset or its children when
* it becomes a valid partition root. The effective set of exclusive
* CPUs granted (effective_xcpus) depends on whether those exclusive
* CPUs are passed down by its ancestors and not yet taken up by
* another sibling partition root along the way.
*
* If its value isn't set, it defaults to cpus_allowed.
*/
cpumask_var_t exclusive_cpus;
/*
* This is old Memory Nodes tasks took on.
*
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
* - A new cpuset's old_mems_allowed is initialized when some
* task is moved into it.
* - old_mems_allowed is used in cpuset_migrate_mm() when we change
* cpuset.mems_allowed and have tasks' nodemask updated, and
* then old_mems_allowed is updated to mems_allowed.
*/
nodemask_t old_mems_allowed;
struct fmeter fmeter; /* memory_pressure filter */
/*
* Tasks are being attached to this cpuset. Used to prevent
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
*/
int attach_in_progress;
/* for custom sched domain */
int relax_domain_level;
/* number of valid local child partitions */
int nr_subparts;
/* partition root state */
int partition_root_state;
/*
* number of SCHED_DEADLINE tasks attached to this cpuset, so that we
* know when to rebuild associated root domain bandwidth information.
*/
int nr_deadline_tasks;
int nr_migrate_dl_tasks;
u64 sum_migrate_dl_bw;
/* Invalid partition error code, not lock protected */
enum prs_errcode prs_err;
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
/* Remote partition silbling list anchored at remote_children */
struct list_head remote_sibling;
/* Used to merge intersecting subsets for generate_sched_domains */
struct uf_node node;
};
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cpuset, css) : NULL;
}
/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
return css_cs(task_css(task, cpuset_cgrp_id));
}
static inline struct cpuset *parent_cs(struct cpuset *cs)
{
return css_cs(cs->css.parent);
}
/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}
static inline int is_mem_exclusive(const struct cpuset *cs)
{
return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}
static inline int is_mem_hardwall(const struct cpuset *cs)
{
return test_bit(CS_MEM_HARDWALL, &cs->flags);
}
static inline int is_sched_load_balance(const struct cpuset *cs)
{
return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
static inline int is_memory_migrate(const struct cpuset *cs)
{
return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}
static inline int is_spread_page(const struct cpuset *cs)
{
return test_bit(CS_SPREAD_PAGE, &cs->flags);
}
static inline int is_spread_slab(const struct cpuset *cs)
{
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
* @pos_css: used for iteration
* @parent_cs: target cpuset to walk children of
*
* Walk @child_cs through the online children of @parent_cs. Must be used
* with RCU read locked.
*/
#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
css_for_each_child((pos_css), &(parent_cs)->css) \
if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
/**
* cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
* @des_cs: loop cursor pointing to the current descendant
* @pos_css: used for iteration
* @root_cs: target cpuset to walk ancestor of
*
* Walk @des_cs through the online descendants of @root_cs. Must be used
* with RCU read locked. The caller may modify @pos_css by calling
* css_rightmost_descendant() to skip subtree. @root_cs is included in the
* iteration and the first node to be visited.
*/
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
void rebuild_sched_domains_locked(void);
void cpuset_callback_lock_irq(void);
void cpuset_callback_unlock_irq(void);
void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
void cpuset_update_tasks_nodemask(struct cpuset *cs);
int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int cpuset_common_seq_show(struct seq_file *sf, void *v);
/*
* cpuset-v1.c
*/
#ifdef CONFIG_CPUSETS_V1
extern struct cftype cpuset1_files[];
void fmeter_init(struct fmeter *fmp);
void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk);
void cpuset1_update_tasks_flags(struct cpuset *cs);
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated);
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
#else
static inline void fmeter_init(struct fmeter *fmp) {}
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk) {}
static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated) {}
static inline int cpuset1_validate_change(struct cpuset *cur,
struct cpuset *trial) { return 0; }
#endif /* CONFIG_CPUSETS_V1 */
#endif /* __CPUSET_INTERNAL_H */

kernel/cgroup/cpuset-v1.c (new file, 603 lines)

@@ -0,0 +1,603 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "cgroup-internal.h"
#include "cpuset-internal.h"
/*
* Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
*/
struct cpuset_remove_tasks_struct {
struct work_struct work;
struct cpuset *cs;
};
/*
* Frequency meter - How fast is some event occurring?
*
* These routines manage a digitally filtered, constant time based,
* event frequency meter. There are four routines:
* fmeter_init() - initialize a frequency meter.
* fmeter_markevent() - called each time the event happens.
* fmeter_getrate() - returns the recent rate of such events.
* fmeter_update() - internal routine used to update fmeter.
*
* A common data structure is passed to each of these routines,
* which is used to keep track of the state required to manage the
* frequency meter and its digital filter.
*
* The filter works on the number of events marked per unit time.
* The filter is single-pole low-pass recursive (IIR). The time unit
* is 1 second. Arithmetic is done using 32-bit integers scaled to
* simulate 3 decimal digits of precision (multiplied by 1000).
*
* With an FM_COEF of 933, and a time base of 1 second, the filter
* has a half-life of 10 seconds, meaning that if the events quit
* happening, then the rate returned from the fmeter_getrate()
* will be cut in half each 10 seconds, until it converges to zero.
*
* It is not worth doing a real infinitely recursive filter. If more
* than FM_MAXTICKS ticks have elapsed since the last filter event,
* just compute FM_MAXTICKS ticks worth, by which point the level
* will be stable.
*
* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
* arithmetic overflow in the fmeter_update() routine.
*
* Given the simple 32 bit integer arithmetic used, this meter works
* best for reporting rates between one per millisecond (msec) and
* one per 32 (approx) seconds. At constant rates faster than one
* per msec it maxes out at values just under 1,000,000. At constant
* rates between one per msec, and one per second it will stabilize
* to a value N*1000, where N is the rate of events per second.
* At constant rates between one per second and one per 32 seconds,
* it will be choppy, moving up on the seconds that have an event,
* and then decaying until the next event. At rates slower than
* about one in 32 seconds, it decays all the way back to zero between
* each event.
*/
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
#define FM_SCALE 1000 /* faux fixed point scale */
/* Initialize a frequency meter */
void fmeter_init(struct fmeter *fmp)
{
fmp->cnt = 0;
fmp->val = 0;
fmp->time = 0;
spin_lock_init(&fmp->lock);
}
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
time64_t now;
u32 ticks;
now = ktime_get_seconds();
ticks = now - fmp->time;
if (ticks == 0)
return;
ticks = min(FM_MAXTICKS, ticks);
while (ticks-- > 0)
fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
fmp->time = now;
fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
fmp->cnt = 0;
}
/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
spin_lock(&fmp->lock);
fmeter_update(fmp);
fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
spin_unlock(&fmp->lock);
}
/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
int val;
spin_lock(&fmp->lock);
fmeter_update(fmp);
val = fmp->val;
spin_unlock(&fmp->lock);
return val;
}
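/*
 * Worked example of the filter above (illustration only, not part of the
 * upstream file): each one-second tick scales val by FM_COEF/FM_SCALE =
 * 0.933, and 0.933^10 ~= 0.50, which is the 10 second half-life quoted in
 * the comment block. At a steady rate of N events/sec, fmeter_markevent()
 * adds N * FM_SCALE to cnt per second and fmeter_update() then adds
 * (FM_SCALE - FM_COEF) * cnt / FM_SCALE, so val converges to the fixed
 * point of val = 0.933 * val + 0.067 * N * 1000, i.e. N * 1000, matching
 * the "stabilize to a value N*1000" behaviour described above.
 */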
/*
* Collection of memory_pressure is suppressed unless
* this flag is enabled by writing "1" to the special
* cpuset file 'memory_pressure_enabled' in the root cpuset.
*/
int cpuset_memory_pressure_enabled __read_mostly;
/*
* __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
*
* This represents the rate at which some task in the cpuset
* ran low on memory on all nodes it was allowed to use, and
* had to enter the kernels page reclaim code in an effort to
* create more free memory by tossing clean pages or swapping
* or writing dirty pages.
*
* Display to user space in the per-cpuset read-only file
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
*/
void __cpuset_memory_pressure_bump(void)
{
rcu_read_lock();
fmeter_markevent(&task_cs(current)->fmeter);
rcu_read_unlock();
}
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
if (val < -1 || val > sched_domain_level_max + 1)
return -EINVAL;
#endif
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
rebuild_sched_domains_locked();
}
return 0;
}
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
s64 val)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
cpus_read_lock();
cpuset_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
retval = update_relax_domain_level(cs, val);
break;
default:
retval = -EINVAL;
break;
}
out_unlock:
cpuset_unlock();
cpus_read_unlock();
return retval;
}
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
return cs->relax_domain_level;
default:
BUG();
}
/* Unreachable but makes gcc happy */
return 0;
}
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
* Call with callback_lock or cpuset_mutex held. The check can be skipped
* if on default hierarchy.
*/
void cpuset1_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk)
{
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
return;
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
task_clear_spread_page(tsk);
if (is_spread_slab(cs))
task_set_spread_slab(tsk);
else
task_clear_spread_slab(tsk);
}
/**
* cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
* function is called with cpuset_mutex held, cpuset membership stays
* stable.
*/
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset1_update_task_spread_flags(cs, task);
css_task_iter_end(&it);
}
/*
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent.
*/
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
struct cpuset *parent;
/*
* Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty).
*/
parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed))
parent = parent_cs(parent);
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
pr_cont_cgroup_name(cs->css.cgroup);
pr_cont("\n");
}
}
static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
struct cpuset_remove_tasks_struct *s;
s = container_of(work, struct cpuset_remove_tasks_struct, work);
remove_tasks_in_empty_cpuset(s->cs);
css_put(&s->cs->css);
kfree(s);
}
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
bool is_empty;
cpuset_callback_lock_irq();
cpumask_copy(cs->cpus_allowed, new_cpus);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->mems_allowed = *new_mems;
cs->effective_mems = *new_mems;
cpuset_callback_unlock_irq();
/*
* Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
* as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
cpuset_update_tasks_cpumask(cs, new_cpus);
if (mems_updated && !nodes_empty(cs->mems_allowed))
cpuset_update_tasks_nodemask(cs);
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
* cpuset. Execute it asynchronously using workqueue.
*/
if (is_empty && cs->css.cgroup->nr_populated_csets &&
css_tryget_online(&cs->css)) {
struct cpuset_remove_tasks_struct *s;
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (WARN_ON_ONCE(!s)) {
css_put(&cs->css);
return;
}
s->cs = cs;
INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
schedule_work(&s->work);
}
}
/*
* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
* are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
}
/*
* cpuset1_validate_change() - Validate conditions specific to legacy (v1)
* behavior.
*/
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
int ret;
WARN_ON_ONCE(!rcu_read_lock_held());
/* Each of our child cpusets must be a subset of us */
ret = -EBUSY;
cpuset_for_each_child(c, css, cur)
if (!is_cpuset_subset(c, trial))
goto out;
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
par = parent_cs(cur);
if (par && !is_cpuset_subset(trial, par))
goto out;
ret = 0;
out:
return ret;
}
#ifdef CONFIG_PROC_PID_CPUSET
/*
* proc_cpuset_show()
* - Print tasks cpuset path into seq_file.
* - Used for /proc/<pid>/cpuset.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk)
{
char *buf;
struct cgroup_subsys_state *css;
int retval;
retval = -ENOMEM;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf)
goto out;
rcu_read_lock();
spin_lock_irq(&css_set_lock);
css = task_css(tsk, cpuset_cgrp_id);
retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
spin_unlock_irq(&css_set_lock);
rcu_read_unlock();
if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
goto out_free;
seq_puts(m, buf);
seq_putc(m, '\n');
retval = 0;
out_free:
kfree(buf);
out:
return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
switch (type) {
case FILE_CPU_EXCLUSIVE:
return is_cpu_exclusive(cs);
case FILE_MEM_EXCLUSIVE:
return is_mem_exclusive(cs);
case FILE_MEM_HARDWALL:
return is_mem_hardwall(cs);
case FILE_SCHED_LOAD_BALANCE:
return is_sched_load_balance(cs);
case FILE_MEMORY_MIGRATE:
return is_memory_migrate(cs);
case FILE_MEMORY_PRESSURE_ENABLED:
return cpuset_memory_pressure_enabled;
case FILE_MEMORY_PRESSURE:
return fmeter_getrate(&cs->fmeter);
case FILE_SPREAD_PAGE:
return is_spread_page(cs);
case FILE_SPREAD_SLAB:
return is_spread_slab(cs);
default:
BUG();
}
/* Unreachable but makes gcc happy */
return 0;
}
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
u64 val)
{
struct cpuset *cs = css_cs(css);
cpuset_filetype_t type = cft->private;
int retval = 0;
cpus_read_lock();
cpuset_lock();
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
}
switch (type) {
case FILE_CPU_EXCLUSIVE:
retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_MEM_HARDWALL:
retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
cpuset_memory_pressure_enabled = !!val;
break;
case FILE_SPREAD_PAGE:
retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
break;
case FILE_SPREAD_SLAB:
retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
break;
default:
retval = -EINVAL;
break;
}
out_unlock:
cpuset_unlock();
cpus_read_unlock();
return retval;
}
/*
* for the common functions, 'private' gives the type of file
*/
struct cftype cpuset1_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
},
{
.name = "mems",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
},
{
.name = "effective_cpus",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_CPULIST,
},
{
.name = "effective_mems",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_MEMLIST,
},
{
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_CPU_EXCLUSIVE,
},
{
.name = "mem_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEM_EXCLUSIVE,
},
{
.name = "mem_hardwall",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEM_HARDWALL,
},
{
.name = "sched_load_balance",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_SCHED_LOAD_BALANCE,
},
{
.name = "sched_relax_domain_level",
.read_s64 = cpuset_read_s64,
.write_s64 = cpuset_write_s64,
.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
},
{
.name = "memory_migrate",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_MIGRATE,
},
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
.private = FILE_MEMORY_PRESSURE,
},
{
.name = "memory_spread_page",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_PAGE,
},
{
/* obsolete, may be removed in the future */
.name = "memory_spread_slab",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_SPREAD_SLAB,
},
{
.name = "memory_pressure_enabled",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE_ENABLED,
},
{ } /* terminate */
};

kernel/cgroup/cpuset.c (file diff suppressed because it is too large)

kernel/cgroup/pids.c

@@ -272,15 +272,10 @@ static void pids_event(struct pids_cgroup *pids_forking,
 */
 static int pids_can_fork(struct task_struct *task, struct css_set *cset)
 {
-struct cgroup_subsys_state *css;
 struct pids_cgroup *pids, *pids_over_limit;
 int err;
-if (cset)
-css = cset->subsys[pids_cgrp_id];
-else
-css = task_css_check(current, pids_cgrp_id, true);
-pids = css_pids(css);
+pids = css_pids(cset->subsys[pids_cgrp_id]);
 err = pids_try_charge(pids, 1, &pids_over_limit);
 if (err)
 pids_event(pids, pids_over_limit);
@@ -290,14 +285,9 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)
 static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
 {
-struct cgroup_subsys_state *css;
 struct pids_cgroup *pids;
-if (cset)
-css = cset->subsys[pids_cgrp_id];
-else
-css = task_css_check(current, pids_cgrp_id, true);
-pids = css_pids(css);
+pids = css_pids(cset->subsys[pids_cgrp_id]);
 pids_uncharge(pids, 1);
 }

kernel/cgroup/rstat.c

@@ -444,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 #ifdef CONFIG_SCHED_CORE
 dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
 #endif
+dst_bstat->ntime += src_bstat->ntime;
 }
 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -455,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 #ifdef CONFIG_SCHED_CORE
 dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
 #endif
+dst_bstat->ntime -= src_bstat->ntime;
 }
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -534,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 switch (index) {
-case CPUTIME_USER:
 case CPUTIME_NICE:
+rstatc->bstat.ntime += delta_exec;
+fallthrough;
+case CPUTIME_USER:
 rstatc->bstat.cputime.utime += delta_exec;
 break;
 case CPUTIME_SYSTEM:
@@ -590,6 +594,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 #ifdef CONFIG_SCHED_CORE
 bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
 #endif
+bstat->ntime += cpustat[CPUTIME_NICE];
 }
 }
@@ -607,13 +612,14 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
 void cgroup_base_stat_cputime_show(struct seq_file *seq)
 {
 struct cgroup *cgrp = seq_css(seq)->cgroup;
-u64 usage, utime, stime;
+u64 usage, utime, stime, ntime;
 if (cgroup_parent(cgrp)) {
 cgroup_rstat_flush_hold(cgrp);
 usage = cgrp->bstat.cputime.sum_exec_runtime;
 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 &utime, &stime);
+ntime = cgrp->bstat.ntime;
 cgroup_rstat_flush_release(cgrp);
 } else {
 /* cgrp->bstat of root is not actually used, reuse it */
@@ -621,16 +627,19 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 usage = cgrp->bstat.cputime.sum_exec_runtime;
 utime = cgrp->bstat.cputime.utime;
 stime = cgrp->bstat.cputime.stime;
+ntime = cgrp->bstat.ntime;
 }
 do_div(usage, NSEC_PER_USEC);
 do_div(utime, NSEC_PER_USEC);
 do_div(stime, NSEC_PER_USEC);
+do_div(ntime, NSEC_PER_USEC);
 seq_printf(seq, "usage_usec %llu\n"
 "user_usec %llu\n"
-"system_usec %llu\n",
-usage, utime, stime);
+"system_usec %llu\n"
+"nice_usec %llu\n",
+usage, utime, stime, ntime);
 cgroup_force_idle_show(seq, &cgrp->bstat);
 }
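With the ntime plumbing above, cpu.stat in cgroup v2 gains a nice_usec field; note that CPUTIME_NICE falls through into the CPUTIME_USER case in __cgroup_account_cputime_field(), so nice time is still counted inside user_usec as well. A minimal reader sketch (the cgroup path is illustrative only):

/* Read user_usec and nice_usec from a cgroup v2 cpu.stat file. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char key[64];
	unsigned long long val, user = 0, nice = 0;
	FILE *f = fopen("/sys/fs/cgroup/system.slice/cpu.stat", "r");

	if (!f) {
		perror("cpu.stat");
		return 1;
	}
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, "user_usec"))
			user = val;
		else if (!strcmp(key, "nice_usec"))
			nice = val;
	}
	fclose(f);
	/* nice time is accounted within user time, so nice <= user here */
	printf("user_usec=%llu nice_usec=%llu\n", user, nice);
	return 0;
}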

kernel/fork.c

@@ -2377,7 +2377,6 @@ static __latent_entropy struct task_struct *copy_process(
 #endif
 #ifdef CONFIG_CPUSETS
 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
-p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
 seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS

kernel/sched/fair.c

@@ -1535,7 +1535,7 @@ static int numa_hint_fault_latency(struct folio *folio)
 int last_time, time;
 time = jiffies_to_msecs(jiffies);
-last_time = xchg_page_access_time(&folio->page, time);
+last_time = folio_xchg_access_time(folio, time);
 return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
@@ -1637,7 +1637,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
 }
 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
-last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid);
+last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
 !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))

mm/Makefile

@@ -29,6 +29,7 @@ KCOV_INSTRUMENT_page_alloc.o := n
 KCOV_INSTRUMENT_debug-pagealloc.o := n
 KCOV_INSTRUMENT_kmemleak.o := n
 KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_memcontrol-v1.o := n
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
 KCOV_INSTRUMENT_failslab.o := n
@@ -96,7 +97,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o
 obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
-obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG) += memcontrol.o memcontrol-v1.o vmpressure.o
 ifdef CONFIG_SWAP
 obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif

mm/huge_memory.c

@@ -1560,7 +1560,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 */
 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) ||
 node_is_toptier(nid))
-last_cpupid = page_cpupid_last(&folio->page);
+last_cpupid = folio_last_cpupid(folio);
 target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
 if (target_nid == NUMA_NO_NODE) {
 folio_put(folio);
@@ -1863,7 +1863,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 if (is_swap_pmd(*pmd)) {
 swp_entry_t entry = pmd_to_swp_entry(*pmd);
-struct page *page = pfn_swap_entry_to_page(entry);
+struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
 pmd_t newpmd;
 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
@@ -1872,7 +1872,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 * A protection check is difficult so
 * just be safe and disable write
 */
-if (PageAnon(page))
+if (folio_test_anon(folio))
 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
 else
 entry = make_readable_migration_entry(swp_offset(entry));
@@ -1894,7 +1894,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 #endif
 if (prot_numa) {
-struct page *page;
+struct folio *folio;
 bool toptier;
 /*
 * Avoid trapping faults against the zero page. The read-only
@@ -1907,8 +1907,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 if (pmd_protnone(*pmd))
 goto unlock;
-page = pmd_page(*pmd);
-toptier = node_is_toptier(page_to_nid(page));
+folio = page_folio(pmd_page(*pmd));
+toptier = node_is_toptier(folio_nid(folio));
 /*
 * Skip scanning top tier node if normal numa
 * balancing is disabled
@@ -1919,7 +1919,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
 !toptier)
-xchg_page_access_time(page, jiffies_to_msecs(jiffies));
+folio_xchg_access_time(folio,
+jiffies_to_msecs(jiffies));
 }
 /*
 * In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -2526,7 +2527,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
 if (page_is_idle(head))
 set_page_idle(page_tail);
-page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
 /*
 * always add to the tail because some iterators expect new

mm/memcontrol-v1.c (new file, 345 lines)

@@ -0,0 +1,345 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include "memcontrol-v1.h"
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
*/
struct mem_cgroup_tree_per_node {
struct rb_root rb_root;
struct rb_node *rb_rightmost;
spinlock_t lock;
};
struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
/*
* Maximum loops in mem_cgroup_soft_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup_per_node *mz_node;
bool rightmost = true;
if (mz->on_tree)
return;
mz->usage_in_excess = new_usage_in_excess;
if (!mz->usage_in_excess)
return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
tree_node);
if (mz->usage_in_excess < mz_node->usage_in_excess) {
p = &(*p)->rb_left;
rightmost = false;
} else {
p = &(*p)->rb_right;
}
}
if (rightmost)
mctz->rb_rightmost = &mz->tree_node;
rb_link_node(&mz->tree_node, parent, p);
rb_insert_color(&mz->tree_node, &mctz->rb_root);
mz->on_tree = true;
}
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz)
{
if (!mz->on_tree)
return;
if (&mz->tree_node == mctz->rb_rightmost)
mctz->rb_rightmost = rb_prev(&mz->tree_node);
rb_erase(&mz->tree_node, &mctz->rb_root);
mz->on_tree = false;
}
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz)
{
unsigned long flags;
spin_lock_irqsave(&mctz->lock, flags);
__mem_cgroup_remove_exceeded(mz, mctz);
spin_unlock_irqrestore(&mctz->lock, flags);
}
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;
if (nr_pages > soft_limit)
excess = nr_pages - soft_limit;
return excess;
}
void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
unsigned long excess;
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
if (lru_gen_enabled()) {
if (soft_limit_excess(memcg))
lru_gen_soft_reclaim(memcg, nid);
return;
}
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
/*
* Necessary to update all ancestors when hierarchy is used,
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
mz = memcg->nodeinfo[nid];
excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on the RB-tree or
* the memcg is over its soft limit.
*/
if (excess || mz->on_tree) {
unsigned long flags;
spin_lock_irqsave(&mctz->lock, flags);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
__mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock_irqrestore(&mctz->lock, flags);
}
}
}
void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
struct mem_cgroup_tree_per_node *mctz;
struct mem_cgroup_per_node *mz;
int nid;
for_each_node(nid) {
mz = memcg->nodeinfo[nid];
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
}
}
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
struct mem_cgroup_per_node *mz;
retry:
mz = NULL;
if (!mctz->rb_rightmost)
goto done; /* Nothing to reclaim from */
mz = rb_entry(mctz->rb_rightmost,
struct mem_cgroup_per_node, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will add it back at the end of reclaim to its correct
* position in the tree.
*/
__mem_cgroup_remove_exceeded(mz, mctz);
if (!soft_limit_excess(mz->memcg) ||
!css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
}
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
struct mem_cgroup_per_node *mz;
spin_lock_irq(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz);
spin_unlock_irq(&mctz->lock);
return mz;
}
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
pg_data_t *pgdat,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
struct mem_cgroup *victim = NULL;
int total = 0;
int loop = 0;
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.pgdat = pgdat,
};
excess = soft_limit_excess(root_memcg);
while (1) {
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
if (!victim) {
loop++;
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might be because there are
* no reclaimable pages under this hierarchy
*/
if (!total)
break;
/*
* We want to do more targeted reclaim.
* excess >> 2 is neither so large that we
* reclaim too much, nor so small that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
break;
}
continue;
}
total += mem_cgroup_shrink_node(victim, gfp_mask, false,
pgdat, &nr_scanned);
*total_scanned += nr_scanned;
if (!soft_limit_excess(root_memcg))
break;
}
mem_cgroup_iter_break(root_memcg, victim);
return total;
}
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_node *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
if (lru_gen_enabled())
return 0;
if (order > 0)
return 0;
mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
/*
* Do not even bother to check the largest node if the root
* is empty. Do it lockless to prevent lock bouncing. Races
* are acceptable as soft limit is best effort anyway.
*/
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
return 0;
/*
* This loop can run for a while, especially if memory cgroups continuously
* keep exceeding their soft limit and putting the system under
* pressure
*/
do {
if (next_mz)
mz = next_mz;
else
mz = mem_cgroup_largest_soft_limit_node(mctz);
if (!mz)
break;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
gfp_mask, total_scanned);
nr_reclaimed += reclaimed;
spin_lock_irq(&mctz->lock);
/*
* If we failed to reclaim anything from this memory cgroup
* it is time to move on to the next cgroup
*/
next_mz = NULL;
if (!reclaimed)
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
excess = soft_limit_excess(mz->memcg);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
* But our reclaim could return 0 simply because, due
* to priority, we are exposing a smaller subset of
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock_irq(&mctz->lock);
css_put(&mz->memcg->css);
loop++;
/*
* Could not reclaim anything and there are no more
* mem cgroups to try or we seem to be looping without
* reclaiming anything.
*/
if (!nr_reclaimed &&
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
} while (!nr_reclaimed);
if (next_mz)
css_put(&next_mz->memcg->css);
return nr_reclaimed;
}
static int __init memcg1_init(void)
{
int node;
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
rtpn->rb_root = RB_ROOT;
rtpn->rb_rightmost = NULL;
spin_lock_init(&rtpn->lock);
soft_limit_tree.rb_tree_per_node[node] = rtpn;
}
return 0;
}
subsys_initcall(memcg1_init);
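The file above is code moved verbatim out of mm/memcontrol.c. The policy it carries over is "reclaim first from the memcg with the largest soft-limit excess", which is why the RB-tree caches its rightmost (largest-excess) node. A minimal user-space sketch of that selection rule, illustrative only and not kernel code (the struct, names, and linear scan are assumptions):

#include <stdio.h>

/* Hypothetical, simplified stand-in for a memcg with a soft limit. */
struct fake_memcg {
	const char *name;
	unsigned long usage;      /* pages currently charged */
	unsigned long soft_limit; /* soft limit in pages */
};

/* Mirrors the idea of soft_limit_excess(): usage above the soft limit, or 0. */
static unsigned long excess_of(const struct fake_memcg *m)
{
	return m->usage > m->soft_limit ? m->usage - m->soft_limit : 0;
}

int main(void)
{
	struct fake_memcg groups[] = {
		{ "a", 1000,  800 },	/* excess 200 */
		{ "b",  500,  600 },	/* excess 0, never considered */
		{ "c", 2000, 1200 },	/* excess 800, reclaimed first */
	};
	const struct fake_memcg *victim = NULL;

	/*
	 * The kernel keeps an RB-tree and caches rb_rightmost so this pick
	 * is cheap; a linear max-search expresses the same selection rule.
	 */
	for (size_t i = 0; i < sizeof(groups) / sizeof(groups[0]); i++) {
		if (!excess_of(&groups[i]))
			continue;
		if (!victim || excess_of(&groups[i]) > excess_of(victim))
			victim = &groups[i];
	}

	if (victim)
		printf("reclaim first from %s (excess %lu pages)\n",
		       victim->name, excess_of(victim));
	return 0;
}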

14
mm/memcontrol-v1.h Normal file
View File

@ -0,0 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef __MM_MEMCONTROL_V1_H
#define __MM_MEMCONTROL_V1_H
void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid);
void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg);
static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg)
{
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
}
#endif /* __MM_MEMCONTROL_V1_H */

View File

@ -70,6 +70,7 @@
#include <net/ip.h> #include <net/ip.h>
#include "slab.h" #include "slab.h"
#include "swap.h" #include "swap.h"
#include "memcontrol-v1.h"
#include <linux/zswap.h> #include <linux/zswap.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
@ -107,23 +108,6 @@ static bool do_memsw_account(void)
#define THRESHOLDS_EVENTS_TARGET 128 #define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024 #define SOFTLIMIT_EVENTS_TARGET 1024
/*
* Cgroups above their limits are maintained in an RB-tree, independent of
* their hierarchy representation
*/
struct mem_cgroup_tree_per_node {
struct rb_root rb_root;
struct rb_node *rb_rightmost;
spinlock_t lock;
};
struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
/* for OOM */ /* for OOM */
struct mem_cgroup_eventfd_list { struct mem_cgroup_eventfd_list {
struct list_head list; struct list_head list;
@ -198,13 +182,6 @@ static struct move_charge_struct {
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
}; };
/*
* Maximum loops in mem_cgroup_soft_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
/* for encoding cft->private value on file */ /* for encoding cft->private value on file */
enum res_type { enum res_type {
_MEM, _MEM,
@ -420,169 +397,6 @@ ino_t page_cgroup_ino(struct page *page)
return ino; return ino;
} }
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup_per_node *mz_node;
bool rightmost = true;
if (mz->on_tree)
return;
mz->usage_in_excess = new_usage_in_excess;
if (!mz->usage_in_excess)
return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
tree_node);
if (mz->usage_in_excess < mz_node->usage_in_excess) {
p = &(*p)->rb_left;
rightmost = false;
} else {
p = &(*p)->rb_right;
}
}
if (rightmost)
mctz->rb_rightmost = &mz->tree_node;
rb_link_node(&mz->tree_node, parent, p);
rb_insert_color(&mz->tree_node, &mctz->rb_root);
mz->on_tree = true;
}
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz)
{
if (!mz->on_tree)
return;
if (&mz->tree_node == mctz->rb_rightmost)
mctz->rb_rightmost = rb_prev(&mz->tree_node);
rb_erase(&mz->tree_node, &mctz->rb_root);
mz->on_tree = false;
}
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz)
{
unsigned long flags;
spin_lock_irqsave(&mctz->lock, flags);
__mem_cgroup_remove_exceeded(mz, mctz);
spin_unlock_irqrestore(&mctz->lock, flags);
}
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;
if (nr_pages > soft_limit)
excess = nr_pages - soft_limit;
return excess;
}
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
unsigned long excess;
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
if (lru_gen_enabled()) {
if (soft_limit_excess(memcg))
lru_gen_soft_reclaim(memcg, nid);
return;
}
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
/*
* Necessary to update all ancestors when hierarchy is used,
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
mz = memcg->nodeinfo[nid];
excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on the RB-tree or
* the memcg is over its soft limit.
*/
if (excess || mz->on_tree) {
unsigned long flags;
spin_lock_irqsave(&mctz->lock, flags);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
__mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock_irqrestore(&mctz->lock, flags);
}
}
}
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
struct mem_cgroup_tree_per_node *mctz;
struct mem_cgroup_per_node *mz;
int nid;
for_each_node(nid) {
mz = memcg->nodeinfo[nid];
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
}
}
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
struct mem_cgroup_per_node *mz;
retry:
mz = NULL;
if (!mctz->rb_rightmost)
goto done; /* Nothing to reclaim from */
mz = rb_entry(mctz->rb_rightmost,
struct mem_cgroup_per_node, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will add it back at the end of reclaim to its correct
* position in the tree.
*/
__mem_cgroup_remove_exceeded(mz, mctz);
if (!soft_limit_excess(mz->memcg) ||
!css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
}
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
struct mem_cgroup_per_node *mz;
spin_lock_irq(&mctz->lock);
mz = __mem_cgroup_largest_soft_limit_node(mctz);
spin_unlock_irq(&mctz->lock);
return mz;
}
/* /*
* memcg and lruvec stats flushing * memcg and lruvec stats flushing
* *
@ -1846,56 +1660,6 @@ unlock:
return ret; return ret;
} }
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
pg_data_t *pgdat,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
struct mem_cgroup *victim = NULL;
int total = 0;
int loop = 0;
unsigned long excess;
unsigned long nr_scanned;
struct mem_cgroup_reclaim_cookie reclaim = {
.pgdat = pgdat,
};
excess = soft_limit_excess(root_memcg);
while (1) {
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
if (!victim) {
loop++;
if (loop >= 2) {
/*
* If we have not been able to reclaim
* anything, it might be because there are
* no reclaimable pages under this hierarchy
*/
if (!total)
break;
/*
* We want to do more targeted reclaim.
* excess >> 2 is neither so large that we
* reclaim too much, nor so small that we keep
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
break;
}
continue;
}
total += mem_cgroup_shrink_node(victim, gfp_mask, false,
pgdat, &nr_scanned);
*total_scanned += nr_scanned;
if (!soft_limit_excess(root_memcg))
break;
}
mem_cgroup_iter_break(root_memcg, victim);
return total;
}
#ifdef CONFIG_LOCKDEP #ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = { static struct lockdep_map memcg_oom_lock_dep_map = {
.name = "memcg_oom_lock", .name = "memcg_oom_lock",
@ -3744,88 +3508,6 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
return ret; return ret;
} }
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_node *mz, *next_mz = NULL;
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
if (lru_gen_enabled())
return 0;
if (order > 0)
return 0;
mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
/*
* Do not even bother to check the largest node if the root
* is empty. Do it lockless to prevent lock bouncing. Races
* are acceptable as soft limit is best effort anyway.
*/
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
return 0;
/*
* This loop can run for a while, especially if memory cgroups continuously
* keep exceeding their soft limit and putting the system under
* pressure
*/
do {
if (next_mz)
mz = next_mz;
else
mz = mem_cgroup_largest_soft_limit_node(mctz);
if (!mz)
break;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
gfp_mask, total_scanned);
nr_reclaimed += reclaimed;
spin_lock_irq(&mctz->lock);
/*
* If we failed to reclaim anything from this memory cgroup
* it is time to move on to the next cgroup
*/
next_mz = NULL;
if (!reclaimed)
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
excess = soft_limit_excess(mz->memcg);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
* But our reclaim could return 0 simply because, due
* to priority, we are exposing a smaller subset of
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
/* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock_irq(&mctz->lock);
css_put(&mz->memcg->css);
loop++;
/*
* Could not reclaim anything and there are no more
* mem cgroups to try or we seem to be looping without
* reclaiming anything.
*/
if (!nr_reclaimed &&
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
} while (!nr_reclaimed);
if (next_mz)
css_put(&next_mz->memcg->css);
return nr_reclaimed;
}
/* /*
* Reclaims as many pages from the given memcg as possible. * Reclaims as many pages from the given memcg as possible.
* *
@ -5668,7 +5350,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_CAST(memcg); return ERR_CAST(memcg);
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); memcg1_soft_limit_reset(memcg);
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
memcg->zswap_max = PAGE_COUNTER_MAX; memcg->zswap_max = PAGE_COUNTER_MAX;
WRITE_ONCE(memcg->zswap_writeback, true); WRITE_ONCE(memcg->zswap_writeback, true);
@ -5841,7 +5523,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_min(&memcg->memory, 0); page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0);
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); memcg1_soft_limit_reset(memcg);
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg); memcg_wb_domain_size_changed(memcg);
} }
@ -7810,7 +7492,7 @@ __setup("cgroup.memory=", cgroup_memory);
*/ */
static int __init mem_cgroup_init(void) static int __init mem_cgroup_init(void)
{ {
int cpu, node; int cpu;
/* /*
* Currently s32 type (can refer to struct batched_lruvec_stat) is * Currently s32 type (can refer to struct batched_lruvec_stat) is
@ -7827,17 +7509,6 @@ static int __init mem_cgroup_init(void)
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
drain_local_stock); drain_local_stock);
for_each_node(node) {
struct mem_cgroup_tree_per_node *rtpn;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
rtpn->rb_root = RB_ROOT;
rtpn->rb_rightmost = NULL;
spin_lock_init(&rtpn->lock);
soft_limit_tree.rb_tree_per_node[node] = rtpn;
}
return 0; return 0;
} }
subsys_initcall(mem_cgroup_init); subsys_initcall(mem_cgroup_init);

View File

@ -3036,23 +3036,24 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
* case, all we need to do here is to mark the page as writable and update * case, all we need to do here is to mark the page as writable and update
* any related book-keeping. * any related book-keeping.
*/ */
static inline void wp_page_reuse(struct vm_fault *vmf) static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
__releases(vmf->ptl) __releases(vmf->ptl)
{ {
struct vm_area_struct *vma = vmf->vma; struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
pte_t entry; pte_t entry;
VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
/* if (folio) {
* Clear the pages cpupid information as the existing VM_BUG_ON(folio_test_anon(folio) &&
* information potentially belongs to a now completely !PageAnonExclusive(vmf->page));
* unrelated process. /*
*/ * Clear the folio's cpupid information as the existing
if (page) * information potentially belongs to a now completely
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); * unrelated process.
*/
folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte); entry = pte_mkyoung(vmf->orig_pte);
@ -3245,6 +3246,7 @@ oom:
* writeable once the page is prepared * writeable once the page is prepared
* *
* @vmf: structure describing the fault * @vmf: structure describing the fault
* @folio: the folio of vmf->page
* *
* This function handles all that is needed to finish a write page fault in a * This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared. * shared mapping due to PTE being read-only once the mapped page is prepared.
@ -3256,7 +3258,7 @@ oom:
* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
* we acquired PTE lock. * we acquired PTE lock.
*/ */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{ {
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
@ -3272,7 +3274,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl); pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE; return VM_FAULT_NOPAGE;
} }
wp_page_reuse(vmf); wp_page_reuse(vmf, folio);
return 0; return 0;
} }
@ -3297,9 +3299,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
ret = vma->vm_ops->pfn_mkwrite(vmf); ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret; return ret;
return finish_mkwrite_fault(vmf); return finish_mkwrite_fault(vmf, NULL);
} }
wp_page_reuse(vmf); wp_page_reuse(vmf, NULL);
return 0; return 0;
} }
@ -3327,14 +3329,14 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
folio_put(folio); folio_put(folio);
return tmp; return tmp;
} }
tmp = finish_mkwrite_fault(vmf); tmp = finish_mkwrite_fault(vmf, folio);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
folio_unlock(folio); folio_unlock(folio);
folio_put(folio); folio_put(folio);
return tmp; return tmp;
} }
} else { } else {
wp_page_reuse(vmf); wp_page_reuse(vmf, folio);
folio_lock(folio); folio_lock(folio);
} }
ret |= fault_dirty_shared_page(vmf); ret |= fault_dirty_shared_page(vmf);
@ -3458,7 +3460,7 @@ reuse:
pte_unmap_unlock(vmf->pte, vmf->ptl); pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0; return 0;
} }
wp_page_reuse(vmf); wp_page_reuse(vmf, folio);
return 0; return 0;
} }
copy: copy:
@ -4866,7 +4868,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
!node_is_toptier(nid)) !node_is_toptier(nid))
last_cpupid = (-1 & LAST_CPUPID_MASK); last_cpupid = (-1 & LAST_CPUPID_MASK);
else else
last_cpupid = page_cpupid_last(&folio->page); last_cpupid = folio_last_cpupid(folio);
target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
if (target_nid == NUMA_NO_NODE) { if (target_nid == NUMA_NO_NODE) {
folio_put(folio); folio_put(folio);

View File

@ -606,20 +606,20 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
* Copy NUMA information to the new page, to prevent over-eager * Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page. * future migrations of this same page.
*/ */
cpupid = page_cpupid_xchg_last(&folio->page, -1); cpupid = folio_xchg_last_cpupid(folio, -1);
/* /*
* For memory tiering mode, when migrate between slow and fast * For memory tiering mode, when migrate between slow and fast
* memory node, reset cpupid, because that is used to record * memory node, reset cpupid, because that is used to record
* page access time in slow memory node. * page access time in slow memory node.
*/ */
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
bool f_toptier = node_is_toptier(page_to_nid(&folio->page)); bool f_toptier = node_is_toptier(folio_nid(folio));
bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page)); bool t_toptier = node_is_toptier(folio_nid(newfolio));
if (f_toptier != t_toptier) if (f_toptier != t_toptier)
cpupid = -1; cpupid = -1;
} }
page_cpupid_xchg_last(&newfolio->page, cpupid); folio_xchg_last_cpupid(newfolio, cpupid);
folio_migrate_ksm(newfolio, folio); folio_migrate_ksm(newfolio, folio);
/* /*

View File

@ -94,19 +94,19 @@ void lruvec_init(struct lruvec *lruvec)
} }
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
int page_cpupid_xchg_last(struct page *page, int cpupid) int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{ {
unsigned long old_flags, flags; unsigned long old_flags, flags;
int last_cpupid; int last_cpupid;
old_flags = READ_ONCE(page->flags); old_flags = READ_ONCE(folio->flags);
do { do {
flags = old_flags; flags = old_flags;
last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
} while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
return last_cpupid; return last_cpupid;
} }
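The conversion above keeps the lock-free update pattern: read the flags word, splice the new cpupid value into its bit field, and retry the compare-and-exchange until no concurrent updater has raced in. A hedged user-space sketch of the same retry loop with C11 atomics (the field width, shift, and names below are made up for illustration and do not match LAST_CPUPID_*):

#include <stdatomic.h>
#include <stdio.h>

#define FIELD_SHIFT 8
#define FIELD_MASK  0xffUL	/* assumed 8-bit field for the sketch */

/* Atomically replace the bit field at FIELD_SHIFT and return the old value. */
static unsigned long xchg_field(_Atomic unsigned long *flags, unsigned long val)
{
	unsigned long old = atomic_load(flags);
	unsigned long new, prev;

	do {
		prev = (old >> FIELD_SHIFT) & FIELD_MASK;
		new = old & ~(FIELD_MASK << FIELD_SHIFT);
		new |= (val & FIELD_MASK) << FIELD_SHIFT;
		/* On failure, 'old' is reloaded with the current value. */
	} while (!atomic_compare_exchange_weak(flags, &old, new));

	return prev;
}

int main(void)
{
	_Atomic unsigned long flags = 0x5aUL << FIELD_SHIFT;

	printf("old field: %#lx\n", xchg_field(&flags, 0x3c));	/* 0x5a */
	printf("new flags: %#lx\n", atomic_load(&flags));	/* 0x3c00 */
	return 0;
}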

View File

@ -114,7 +114,7 @@ static long change_pte_range(struct mmu_gather *tlb,
* pages. See similar comment in change_huge_pmd. * pages. See similar comment in change_huge_pmd.
*/ */
if (prot_numa) { if (prot_numa) {
struct page *page; struct folio *folio;
int nid; int nid;
bool toptier; bool toptier;
@ -122,13 +122,14 @@ static long change_pte_range(struct mmu_gather *tlb,
if (pte_protnone(oldpte)) if (pte_protnone(oldpte))
continue; continue;
page = vm_normal_page(vma, addr, oldpte); folio = vm_normal_folio(vma, addr, oldpte);
if (!page || is_zone_device_page(page) || PageKsm(page)) if (!folio || folio_is_zone_device(folio) ||
folio_test_ksm(folio))
continue; continue;
/* Also skip shared copy-on-write pages */ /* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) && if (is_cow_mapping(vma->vm_flags) &&
page_count(page) != 1) folio_ref_count(folio) != 1)
continue; continue;
/* /*
@ -136,14 +137,15 @@ static long change_pte_range(struct mmu_gather *tlb,
* it cannot move them all from MIGRATE_ASYNC * it cannot move them all from MIGRATE_ASYNC
* context. * context.
*/ */
if (page_is_file_lru(page) && PageDirty(page)) if (folio_is_file_lru(folio) &&
folio_test_dirty(folio))
continue; continue;
/* /*
* Don't mess with PTEs if page is already on the node * Don't mess with PTEs if page is already on the node
* a single-threaded process is running on. * a single-threaded process is running on.
*/ */
nid = page_to_nid(page); nid = folio_nid(folio);
if (target_node == nid) if (target_node == nid)
continue; continue;
toptier = node_is_toptier(nid); toptier = node_is_toptier(nid);
@ -157,7 +159,7 @@ static long change_pte_range(struct mmu_gather *tlb,
continue; continue;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!toptier) !toptier)
xchg_page_access_time(page, folio_xchg_access_time(folio,
jiffies_to_msecs(jiffies)); jiffies_to_msecs(jiffies));
} }

View File

@ -0,0 +1 @@
CONFIG_CPUSETS_V1=y

View File

@ -8,6 +8,7 @@
#include <pthread.h> #include <pthread.h>
#include <stdio.h> #include <stdio.h>
#include <time.h> #include <time.h>
#include <unistd.h>
#include "../kselftest.h" #include "../kselftest.h"
#include "cgroup_util.h" #include "cgroup_util.h"
@ -229,6 +230,79 @@ cleanup:
return ret; return ret;
} }
/*
* Creates a niced process that consumes CPU and checks that the elapsed
* usertime in the cgroup is close to the expected time.
*/
static int test_cpucg_nice(const char *root)
{
int ret = KSFT_FAIL;
int status;
long user_usec, nice_usec;
long usage_seconds = 2;
long expected_nice_usec = usage_seconds * USEC_PER_SEC;
char *cpucg;
pid_t pid;
cpucg = cg_name(root, "cpucg_test");
if (!cpucg)
goto cleanup;
if (cg_create(cpucg))
goto cleanup;
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
if (nice_usec == -1)
ret = KSFT_SKIP;
if (user_usec != 0 || nice_usec != 0)
goto cleanup;
/*
* We fork here to create a new process that can be niced without
* polluting the nice value of other selftests
*/
pid = fork();
if (pid < 0) {
goto cleanup;
} else if (pid == 0) {
struct cpu_hog_func_param param = {
.nprocs = 1,
.ts = {
.tv_sec = usage_seconds,
.tv_nsec = 0,
},
.clock_type = CPU_HOG_CLOCK_PROCESS,
};
char buf[64];
snprintf(buf, sizeof(buf), "%d", getpid());
if (cg_write(cpucg, "cgroup.procs", buf))
goto cleanup;
/* Try to keep niced CPU usage as constrained to hog_cpu as possible */
nice(1);
hog_cpus_timed(cpucg, &param);
exit(0);
} else {
waitpid(pid, &status, 0);
if (!WIFEXITED(status))
goto cleanup;
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
if (!values_close(nice_usec, expected_nice_usec, 1))
goto cleanup;
ret = KSFT_PASS;
}
cleanup:
cg_destroy(cpucg);
free(cpucg);
return ret;
}
static int static int
run_cpucg_weight_test( run_cpucg_weight_test(
const char *root, const char *root,
@ -686,6 +760,7 @@ struct cpucg_test {
} tests[] = { } tests[] = {
T(test_cpucg_subtree_control), T(test_cpucg_subtree_control),
T(test_cpucg_stats), T(test_cpucg_stats),
T(test_cpucg_nice),
T(test_cpucg_weight_overprovisioned), T(test_cpucg_weight_overprovisioned),
T(test_cpucg_weight_underprovisioned), T(test_cpucg_weight_underprovisioned),
T(test_cpucg_nested_weight_overprovisioned), T(test_cpucg_nested_weight_overprovisioned),

View File

@ -0,0 +1,77 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Basic test for cpuset v1 interfaces write/read
#
skip_test() {
echo "$1"
echo "Test SKIPPED"
exit 4 # ksft_skip
}
write_test() {
dir=$1
interface=$2
value=$3
original=$(cat $dir/$interface)
echo "testing $interface $value"
echo $value > $dir/$interface
new=$(cat $dir/$interface)
[[ $value -ne $(cat $dir/$interface) ]] && {
echo "$interface write $value failed: new:$new"
exit 1
}
}
[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
# Find cpuset v1 mount point
CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}')
[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!"
#
# Create a test cpuset and run read/write tests on its interface files
#
TDIR=test$$
[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR
ITF_MATRIX=(
#interface value expect root_only
'cpuset.cpus 0-1 0-1 0'
'cpuset.mem_exclusive 1 1 0'
'cpuset.mem_exclusive 0 0 0'
'cpuset.mem_hardwall 1 1 0'
'cpuset.mem_hardwall 0 0 0'
'cpuset.memory_migrate 1 1 0'
'cpuset.memory_migrate 0 0 0'
'cpuset.memory_spread_page 1 1 0'
'cpuset.memory_spread_page 0 0 0'
'cpuset.memory_spread_slab 1 1 0'
'cpuset.memory_spread_slab 0 0 0'
'cpuset.mems 0 0 0'
'cpuset.sched_load_balance 1 1 0'
'cpuset.sched_load_balance 0 0 0'
'cpuset.sched_relax_domain_level 2 2 0'
'cpuset.memory_pressure_enabled 1 1 1'
'cpuset.memory_pressure_enabled 0 0 1'
)
run_test()
{
cnt="${ITF_MATRIX[@]}"
for i in "${ITF_MATRIX[@]}" ; do
args=($i)
root_only=${args[3]}
[[ $root_only -eq 1 ]] && {
write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}"
continue
}
write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}"
done
}
run_test
rmdir $CPUSET/$TDIR
echo "Test PASSED"
exit 0
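The script exercises each cpuset v1 file by writing a value and reading it back. For comparison with the C selftests, a rough stand-alone equivalent of its write_test() helper is sketched below; the cgroupfs path in main() is an assumption, while the script discovers the real mount point from the mount table:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/*
 * Write 'value' to a cgroup file and check that it reads back the same,
 * mirroring the script's write_test().
 */
static int write_and_verify(const char *path, const char *value)
{
	char buf[64] = { 0 };
	ssize_t len;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	len = write(fd, value, strlen(value));
	close(fd);
	if (len < 0)
		return -1;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	len = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (len < 0)
		return -1;

	buf[strcspn(buf, "\n")] = '\0';
	return strcmp(buf, value) ? -1 : 0;
}

int main(void)
{
	/* Hypothetical test cpuset created beforehand, as the script does. */
	return write_and_verify("/sys/fs/cgroup/cpuset/test/cpuset.cpus",
				"0-1") ? 1 : 0;
}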

View File

@ -50,9 +50,35 @@ static int get_zswap_stored_pages(size_t *value)
return read_int("/sys/kernel/debug/zswap/stored_pages", value); return read_int("/sys/kernel/debug/zswap/stored_pages", value);
} }
static int get_zswap_written_back_pages(size_t *value) static long get_cg_wb_count(const char *cg)
{ {
return read_int("/sys/kernel/debug/zswap/written_back_pages", value); return cg_read_key_long(cg, "memory.stat", "zswpwb");
}
static long get_zswpout(const char *cgroup)
{
return cg_read_key_long(cgroup, "memory.stat", "zswpout ");
}
static int allocate_and_read_bytes(const char *cgroup, void *arg)
{
size_t size = (size_t)arg;
char *mem = (char *)malloc(size);
int ret = 0;
if (!mem)
return -1;
for (int i = 0; i < size; i += 4095)
mem[i] = 'a';
/* Go through the allocated memory to (z)swap in and out pages */
for (int i = 0; i < size; i += 4095) {
if (mem[i] != 'a')
ret = -1;
}
free(mem);
return ret;
} }
static int allocate_bytes(const char *cgroup, void *arg) static int allocate_bytes(const char *cgroup, void *arg)
@ -68,21 +94,33 @@ static int allocate_bytes(const char *cgroup, void *arg)
return 0; return 0;
} }
/* static char *setup_test_group_1M(const char *root, const char *name)
* When trying to store a memcg page in zswap, if the memcg hits its memory
* limit in zswap, writeback should not be triggered.
*
* This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may
* not zswap"). Needs to be revised when a per memcg writeback mechanism is
* implemented.
*/
static int test_no_invasive_cgroup_shrink(const char *root)
{ {
size_t written_back_before, written_back_after; char *group_name = cg_name(root, name);
if (!group_name)
return NULL;
if (cg_create(group_name))
goto fail;
if (cg_write(group_name, "memory.max", "1M")) {
cg_destroy(group_name);
goto fail;
}
return group_name;
fail:
free(group_name);
return NULL;
}
/*
* Sanity test to check that pages are written into zswap.
*/
static int test_zswap_usage(const char *root)
{
long zswpout_before, zswpout_after;
int ret = KSFT_FAIL; int ret = KSFT_FAIL;
char *test_group; char *test_group;
/* Set up */
test_group = cg_name(root, "no_shrink_test"); test_group = cg_name(root, "no_shrink_test");
if (!test_group) if (!test_group)
goto out; goto out;
@ -90,26 +128,334 @@ static int test_no_invasive_cgroup_shrink(const char *root)
goto out; goto out;
if (cg_write(test_group, "memory.max", "1M")) if (cg_write(test_group, "memory.max", "1M"))
goto out; goto out;
if (cg_write(test_group, "memory.zswap.max", "10K"))
zswpout_before = get_zswpout(test_group);
if (zswpout_before < 0) {
ksft_print_msg("Failed to get zswpout\n");
goto out; goto out;
if (get_zswap_written_back_pages(&written_back_before)) }
/* Allocate more than memory.max to push memory into zswap */
if (cg_run(test_group, allocate_bytes, (void *)MB(4)))
goto out; goto out;
/* Allocate 10x memory.max to push memory into zswap */ /* Verify that pages come into zswap */
if (cg_run(test_group, allocate_bytes, (void *)MB(10))) zswpout_after = get_zswpout(test_group);
if (zswpout_after <= zswpout_before) {
ksft_print_msg("zswpout does not increase after test program\n");
goto out; goto out;
}
ret = KSFT_PASS;
/* Verify that no writeback happened because of the memcg allocation */
if (get_zswap_written_back_pages(&written_back_after))
goto out;
if (written_back_after == written_back_before)
ret = KSFT_PASS;
out: out:
cg_destroy(test_group); cg_destroy(test_group);
free(test_group); free(test_group);
return ret; return ret;
} }
/*
* Check that when memory.zswap.max = 0, no pages can go to the zswap pool for
* the cgroup.
*/
static int test_swapin_nozswap(const char *root)
{
int ret = KSFT_FAIL;
char *test_group;
long swap_peak, zswpout;
test_group = cg_name(root, "no_zswap_test");
if (!test_group)
goto out;
if (cg_create(test_group))
goto out;
if (cg_write(test_group, "memory.max", "8M"))
goto out;
if (cg_write(test_group, "memory.zswap.max", "0"))
goto out;
/* Allocate and read more than memory.max to trigger swapin */
if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
goto out;
/* Verify that pages are swapped out, but no zswap happened */
swap_peak = cg_read_long(test_group, "memory.swap.peak");
if (swap_peak < 0) {
ksft_print_msg("failed to get cgroup's swap_peak\n");
goto out;
}
if (swap_peak < MB(24)) {
ksft_print_msg("at least 24MB of memory should be swapped out\n");
goto out;
}
zswpout = get_zswpout(test_group);
if (zswpout < 0) {
ksft_print_msg("failed to get zswpout\n");
goto out;
}
if (zswpout > 0) {
ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n");
goto out;
}
ret = KSFT_PASS;
out:
cg_destroy(test_group);
free(test_group);
return ret;
}
/* Simple test to verify the (z)swapin code paths */
static int test_zswapin(const char *root)
{
int ret = KSFT_FAIL;
char *test_group;
long zswpin;
test_group = cg_name(root, "zswapin_test");
if (!test_group)
goto out;
if (cg_create(test_group))
goto out;
if (cg_write(test_group, "memory.max", "8M"))
goto out;
if (cg_write(test_group, "memory.zswap.max", "max"))
goto out;
/* Allocate and read more than memory.max to trigger (z)swap in */
if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
goto out;
zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin ");
if (zswpin < 0) {
ksft_print_msg("failed to get zswpin\n");
goto out;
}
if (zswpin < MB(24) / PAGE_SIZE) {
ksft_print_msg("at least 24MB should be brought back from zswap\n");
goto out;
}
ret = KSFT_PASS;
out:
cg_destroy(test_group);
free(test_group);
return ret;
}
/*
* Attempt writeback with the following steps:
* 1. Allocate memory.
* 2. Reclaim memory equal to the amount that was allocated in step 1.
This will move it into zswap.
* 3. Save current zswap usage.
* 4. Move the memory allocated in step 1 back in from zswap.
* 5. Set zswap.max to half the amount that was recorded in step 3.
* 6. Attempt to reclaim memory equal to the amount that was allocated,
this will either trigger writeback if it's enabled, or reclamation
will fail if writeback is disabled as there isn't enough zswap space.
*/
static int attempt_writeback(const char *cgroup, void *arg)
{
long pagesize = sysconf(_SC_PAGESIZE);
size_t memsize = MB(4);
char buf[pagesize];
long zswap_usage;
bool wb_enabled = *(bool *) arg;
int ret = -1;
char *mem;
mem = (char *)malloc(memsize);
if (!mem)
return ret;
/*
* Fill half of each page with increasing data, and keep the other
* half empty; this results in data that is still compressible
* and ends up in zswap, with material zswap usage.
*/
for (int i = 0; i < pagesize; i++)
buf[i] = i < pagesize/2 ? (char) i : 0;
for (int i = 0; i < memsize; i += pagesize)
memcpy(&mem[i], buf, pagesize);
/* Try and reclaim allocated memory */
if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
ksft_print_msg("Failed to reclaim all of the requested memory\n");
goto out;
}
zswap_usage = cg_read_long(cgroup, "memory.zswap.current");
/* zswpin */
for (int i = 0; i < memsize; i += pagesize) {
if (memcmp(&mem[i], buf, pagesize)) {
ksft_print_msg("invalid memory\n");
goto out;
}
}
if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2))
goto out;
/*
* If writeback is enabled, trying to reclaim memory now will trigger a
* writeback as zswap.max is half of what was needed when reclaim ran the first time.
* If writeback is disabled, memory reclaim will fail as zswap is limited and
* it can't writeback to swap.
*/
ret = cg_write_numeric(cgroup, "memory.reclaim", memsize);
if (!wb_enabled)
ret = (ret == -EAGAIN) ? 0 : -1;
out:
free(mem);
return ret;
}
static int test_zswap_writeback_one(const char *cgroup, bool wb)
{
long zswpwb_before, zswpwb_after;
zswpwb_before = get_cg_wb_count(cgroup);
if (zswpwb_before != 0) {
ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before);
return -1;
}
if (cg_run(cgroup, attempt_writeback, (void *) &wb))
return -1;
/* Verify that zswap writeback occurred only if writeback was enabled */
zswpwb_after = get_cg_wb_count(cgroup);
if (zswpwb_after < 0)
return -1;
if (wb != !!zswpwb_after) {
ksft_print_msg("zswpwb_after is %ld while wb is %s",
zswpwb_after, wb ? "enabled" : "disabled");
return -1;
}
return 0;
}
/* Test to verify the zswap writeback path */
static int test_zswap_writeback(const char *root, bool wb)
{
int ret = KSFT_FAIL;
char *test_group, *test_group_child = NULL;
if (cg_read_strcmp(root, "memory.zswap.writeback", "1"))
return KSFT_SKIP;
test_group = cg_name(root, "zswap_writeback_test");
if (!test_group)
goto out;
if (cg_create(test_group))
goto out;
if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0"))
goto out;
if (test_zswap_writeback_one(test_group, wb))
goto out;
/* Reset memory.zswap.max to max (modified by attempt_writeback), and
* set up child cgroup, whose memory.zswap.writeback is hardcoded to 1.
* Thus, the parent's setting shall be what's in effect. */
if (cg_write(test_group, "memory.zswap.max", "max"))
goto out;
if (cg_write(test_group, "cgroup.subtree_control", "+memory"))
goto out;
test_group_child = cg_name(test_group, "zswap_writeback_test_child");
if (!test_group_child)
goto out;
if (cg_create(test_group_child))
goto out;
if (cg_write(test_group_child, "memory.zswap.writeback", "1"))
goto out;
if (test_zswap_writeback_one(test_group_child, wb))
goto out;
ret = KSFT_PASS;
out:
if (test_group_child) {
cg_destroy(test_group_child);
free(test_group_child);
}
cg_destroy(test_group);
free(test_group);
return ret;
}
static int test_zswap_writeback_enabled(const char *root)
{
return test_zswap_writeback(root, true);
}
static int test_zswap_writeback_disabled(const char *root)
{
return test_zswap_writeback(root, false);
}
/*
* When trying to store a memcg page in zswap, if the memcg hits its memory
* limit in zswap, writeback should affect only the zswapped pages of that
* memcg.
*/
static int test_no_invasive_cgroup_shrink(const char *root)
{
int ret = KSFT_FAIL;
size_t control_allocation_size = MB(10);
char *control_allocation = NULL, *wb_group = NULL, *control_group = NULL;
wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
if (!wb_group)
return KSFT_FAIL;
if (cg_write(wb_group, "memory.zswap.max", "10K"))
goto out;
control_group = setup_test_group_1M(root, "per_memcg_wb_test2");
if (!control_group)
goto out;
/* Push some control_group memory into zswap */
if (cg_enter_current(control_group))
goto out;
control_allocation = malloc(control_allocation_size);
for (int i = 0; i < control_allocation_size; i += 4095)
control_allocation[i] = 'a';
if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1)
goto out;
/* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */
if (cg_run(wb_group, allocate_bytes, (void *)MB(10)))
goto out;
/* Verify that only zswapped memory from wb_group has been written back */
if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0)
ret = KSFT_PASS;
out:
cg_enter_current(root);
if (control_group) {
cg_destroy(control_group);
free(control_group);
}
cg_destroy(wb_group);
free(wb_group);
if (control_allocation)
free(control_allocation);
return ret;
}
struct no_kmem_bypass_child_args { struct no_kmem_bypass_child_args {
size_t target_alloc_bytes; size_t target_alloc_bytes;
size_t child_allocated; size_t child_allocated;
@ -177,8 +523,6 @@ static int test_no_kmem_bypass(const char *root)
trigger_allocation_size = sys_info.totalram / 20; trigger_allocation_size = sys_info.totalram / 20;
/* Set up test memcg */ /* Set up test memcg */
if (cg_write(root, "cgroup.subtree_control", "+memory"))
goto out;
test_group = cg_name(root, "kmem_bypass_test"); test_group = cg_name(root, "kmem_bypass_test");
if (!test_group) if (!test_group)
goto out; goto out;
@ -235,6 +579,11 @@ struct zswap_test {
int (*fn)(const char *root); int (*fn)(const char *root);
const char *name; const char *name;
} tests[] = { } tests[] = {
T(test_zswap_usage),
T(test_swapin_nozswap),
T(test_zswapin),
T(test_zswap_writeback_enabled),
T(test_zswap_writeback_disabled),
T(test_no_kmem_bypass), T(test_no_kmem_bypass),
T(test_no_invasive_cgroup_shrink), T(test_no_invasive_cgroup_shrink),
}; };
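The new zswap cases plug into the selftest's table-driven pattern: the T() macro pairs a test function with its stringified name, and a runner walks the table. A self-contained sketch of that pattern (the dummy tests and the trivial runner are assumptions, not the actual kselftest harness):

#include <stdio.h>

#define KSFT_PASS 0
#define KSFT_FAIL 1

/* Each test receives the cgroup root path and returns a ksft status code. */
struct demo_test {
	int (*fn)(const char *root);
	const char *name;
};

static int test_always_pass(const char *root)
{
	(void)root;
	return KSFT_PASS;
}

static int test_always_fail(const char *root)
{
	(void)root;
	return KSFT_FAIL;
}

/* Mirrors the T() idiom: pair a function with its stringified name. */
#define T(x) { x, #x }
static struct demo_test tests[] = {
	T(test_always_pass),
	T(test_always_fail),
};
#undef T

int main(void)
{
	const char *root = "/sys/fs/cgroup";	/* assumed cgroup2 mount point */

	for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%s: %s\n", tests[i].name,
		       tests[i].fn(root) == KSFT_PASS ? "ok" : "not ok");
	return 0;
}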