Merge: cgroup: Backport upstream cgroup commits up to v6.12

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/6581
JIRA: https://issues.redhat.com/browse/RHEL-80382

This MR backports upstream cgroup commits up to v6.12 with relevant fixes, if applicable.

Signed-off-by: Radostin Stoyanov <rstoyano@redhat.com>
Approved-by: Waiman Long <longman@redhat.com>
Approved-by: Rafael Aquini <raquini@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: Jan Stancek <jstancek@redhat.com>

This commit is contained in commit ce052eaf83.
@@ -533,10 +533,12 @@ cgroup namespace on namespace creation.
Because the resource control interface files in a given directory
control the distribution of the parent's resources, the delegatee
shouldn't be allowed to write to them. For the first method, this is
-achieved by not granting access to these files. For the second, the
-kernel rejects writes to all files other than "cgroup.procs" and
-"cgroup.subtree_control" on a namespace root from inside the
-namespace.
+achieved by not granting access to these files. For the second, files
+outside the namespace should be hidden from the delegatee by the means
+of at least mount namespacing, and the kernel rejects writes to all
+files on a namespace root from inside the cgroup namespace, except for
+those files listed in "/sys/kernel/cgroup/delegate" (including
+"cgroup.procs", "cgroup.threads", "cgroup.subtree_control", etc.).

The end results are equivalent for both delegation types. Once
delegated, the user can build sub-hierarchy under the directory,
@@ -1708,6 +1710,8 @@ PAGE_SIZE multiple when read back.

Note that this is subtly different from setting memory.swap.max to
0, as it still allows for pages to be written to the zswap pool.
+This setting has no effect if zswap is disabled, and swapping
+is allowed unless memory.swap.max is set to 0.

memory.pressure
A read-only nested-keyed file.
@@ -4928,9 +4928,12 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
F: Documentation/admin-guide/cgroup-v1/cpusets.rst
F: include/linux/cpuset.h
+F: kernel/cgroup/cpuset-internal.h
+F: kernel/cgroup/cpuset-v1.c
F: kernel/cgroup/cpuset.c
F: tools/testing/selftests/cgroup/test_cpuset.c
F: tools/testing/selftests/cgroup/test_cpuset_prs.sh
+F: tools/testing/selftests/cgroup/test_cpuset_v1_base.sh

CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
M: Johannes Weiner <hannes@cmpxchg.org>
@@ -172,7 +172,11 @@ struct cgroup_subsys_state {
/* reference count - access via css_[try]get() and css_put() */
struct percpu_ref refcnt;

-/* siblings list anchored at the parent's ->children */
+/*
+* siblings list anchored at the parent's ->children
+*
+* linkage is protected by cgroup_mutex or RCU
+*/
struct list_head sibling;
struct list_head children;

@@ -323,6 +327,7 @@ struct cgroup_base_stat {
#ifdef CONFIG_SCHED_CORE
u64 forceidle_sum;
#endif
+u64 ntime;
};

/*
@@ -99,6 +99,7 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
const struct task_struct *tsk2);

+#ifdef CONFIG_CPUSETS_V1
#define cpuset_memory_pressure_bump() \
do { \
if (cpuset_memory_pressure_enabled) \

@@ -106,6 +107,9 @@ extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
} while (0)
extern int cpuset_memory_pressure_enabled;
extern void __cpuset_memory_pressure_bump(void);
+#else
+static inline void cpuset_memory_pressure_bump(void) { }
+#endif

extern void cpuset_task_status_allowed(struct seq_file *m,
struct task_struct *task);

@@ -113,7 +117,6 @@ extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);

extern int cpuset_mem_spread_node(void);
-extern int cpuset_slab_spread_node(void);

static inline int cpuset_do_page_mem_spread(void)
{

@@ -251,11 +254,6 @@ static inline int cpuset_mem_spread_node(void)
return 0;
}

-static inline int cpuset_slab_spread_node(void)
-{
-return 0;
-}
-
static inline int cpuset_do_page_mem_spread(void)
{
return 0;
@@ -1433,7 +1433,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr);

vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif

/*

@@ -1783,26 +1782,26 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)

#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
-return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
}

-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
-return page->_last_cpupid;
+return folio->_last_cpupid;
}
static inline void page_cpupid_reset_last(struct page *page)
{
page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
-return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}

-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);

static inline void page_cpupid_reset_last(struct page *page)
{

@@ -1810,11 +1809,12 @@ static inline void page_cpupid_reset_last(struct page *page)
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */

-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
int last_time;

-last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+last_time = folio_xchg_last_cpupid(folio,
+time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}

@@ -1828,19 +1828,19 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
}
}
#else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
-return page_to_nid(page); /* XXX */
+return folio_nid(folio); /* XXX */
}

-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
return 0;
}

-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
-return page_to_nid(page); /* XXX */
+return folio_nid(folio); /* XXX */
}

static inline int cpupid_to_nid(int cpupid)
@@ -190,6 +190,10 @@ struct page {
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+int _last_cpupid;
+#endif
+
#ifdef CONFIG_KMSAN
/*
* KMSAN metadata for this page:

@@ -201,10 +205,6 @@ struct page {
struct page *kmsan_shadow;
struct page *kmsan_origin;
#endif
-
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-int _last_cpupid;
-#endif
} _struct_page_alignment;

/*

@@ -263,6 +263,8 @@ typedef struct {
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
+* @virtual: Virtual address in the kernel direct map.
+* @_last_cpupid: IDs of last CPU and last process that accessed the folio.
* @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
* @_nr_pages_mapped: Do not use directly, call folio_mapcount().
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().

@@ -308,6 +310,12 @@ struct folio {
atomic_t _refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
+#if defined(WANT_PAGE_VIRTUAL)
+void *virtual;
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+int _last_cpupid;
+#endif
/* private: the union with struct page is transitional */
};

@@ -364,6 +372,12 @@ FOLIO_MATCH(_refcount, _refcount);
#ifdef CONFIG_MEMCG
FOLIO_MATCH(memcg_data, memcg_data);
#endif
+#if defined(WANT_PAGE_VIRTUAL)
+FOLIO_MATCH(virtual, virtual);
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+FOLIO_MATCH(_last_cpupid, _last_cpupid);
+#endif
#undef FOLIO_MATCH
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct folio, fl) == \
@@ -1244,7 +1244,6 @@ struct task_struct {
/* Sequence number to catch updates: */
seqcount_spinlock_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
-int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
init/Kconfig

@@ -1123,9 +1123,23 @@ config CPUSETS

Say N if unsure.

+config CPUSETS_V1
+bool "Legacy cgroup v1 cpusets controller"
+depends on CPUSETS
+default n
+help
+Legacy cgroup v1 cpusets controller which has been deprecated by
+cgroup v2 implementation. The v1 is there for legacy applications
+which haven't migrated to the new cgroup v2 interface yet. Legacy
+interface includes cpuset filesystem and /proc/<pid>/cpuset. If you
+do not have any such application then you are completely fine leaving
+this option disabled.
+
+Say N if unsure.
+
config PROC_PID_CPUSET
bool "Include legacy /proc/<pid>/cpuset file"
-depends on CPUSETS
+depends on CPUSETS_V1
default y

config CGROUP_DEVICE
@@ -5,5 +5,6 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
obj-$(CONFIG_CGROUP_MISC) += misc.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
@@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid)
return cgroup_no_v1_mask & (1 << ssid);
}

+static bool cgroup1_subsys_absent(struct cgroup_subsys *ss)
+{
+/* Check also dfl_cftypes for file-less controllers, i.e. perf_event */
+return ss->legacy_cftypes == NULL && ss->dfl_cftypes;
+}
+
/**
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task

@@ -675,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
* cgroup_mutex contention.
*/

-for_each_subsys(ss, i)
+for_each_subsys(ss, i) {
+if (cgroup1_subsys_absent(ss))
+continue;
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
+}

return 0;
}

@@ -932,7 +941,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
if (ret != -ENOPARAM)
return ret;
for_each_subsys(ss, i) {
-if (strcmp(param->key, ss->legacy_name))
+if (strcmp(param->key, ss->legacy_name) ||
+cgroup1_subsys_absent(ss))
continue;
if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
return invalfc(fc, "Disabled controller '%s'",

@@ -1024,7 +1034,8 @@ static int check_cgroupfs_options(struct fs_context *fc)
mask = ~((u16)1 << cpuset_cgrp_id);
#endif
for_each_subsys(ss, i)
-if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) &&
+!cgroup1_subsys_absent(ss))
enabled |= 1 << i;

ctx->subsys_mask &= enabled;
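For context, proc_cgroupstats_show() implements /proc/cgroups, so with the hunks above a controller that provides no cgroup v1 interface files (legacy_cftypes unset while dfl_cftypes is set) is skipped in that listing and can no longer be selected when mounting a v1 hierarchy. A hypothetical excerpt of the resulting file (rows and counts are illustrative only, not taken from this MR):

#subsys_name	hierarchy	num_cgroups	enabled
cpu	0	1	1
memory	0	1	1
pids	0	1	1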
@@ -2331,7 +2331,7 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};

-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
static const struct fs_context_operations cpuset_fs_context_ops = {
.get_tree = cgroup1_get_tree,
.free = cgroup_fs_context_free,

@@ -4124,7 +4124,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
* except for the files explicitly marked delegatable -
-* cgroup.procs and cgroup.subtree_control.
+* eg. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&

@@ -4623,8 +4623,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
-* section. This function will return the correct next descendant as long
-* as both @pos and @root are accessible and @pos is a descendant of @root.
+* section. Additionally, it isn't necessary to hold onto a reference to @pos.
+* This function will return the correct next descendant as long as both @pos
+* and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the

@@ -4672,8 +4673,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre);
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
-* section. This function will return the correct rightmost descendant as
-* long as @pos is accessible.
+* section. Additionally, it isn't necessary to hold onto a reference to @pos.
+* This function will return the correct rightmost descendant as long as @pos
+* is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)

@@ -4717,9 +4719,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
-* section. This function will return the correct next descendant as long
-* as both @pos and @cgroup are accessible and @pos is a descendant of
-* @cgroup.
+* section. Additionally, it isn't necessary to hold onto a reference to @pos.
+* This function will return the correct next descendant as long as both @pos
+* and @cgroup are accessible and @pos is a descendant of @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the

@@ -5780,7 +5782,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
{
struct cgroup *cgroup;
int ret = false;
-int level = 1;
+int level = 0;

lockdep_assert_held(&cgroup_mutex);

@@ -5788,7 +5790,7 @@ static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
if (cgroup->nr_descendants >= cgroup->max_descendants)
goto fail;

-if (level > cgroup->max_depth)
+if (level >= cgroup->max_depth)
goto fail;

level++;
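The two depth-check hunks above are behaviour-preserving for finite limits: failing at ancestor distance d when "d + 1 > max_depth" rejects exactly the same depths as "d >= max_depth". The rewrite appears to matter for the default cgroup.max.depth of INT_MAX, where the old form compares an int against INT_MAX with ">" and therefore can never trigger without overflowing "level". A minimal user-space sketch of the two checks (illustrative only, not the kernel code; the helper names are made up):

#include <limits.h>
#include <stdio.h>

/* Old check: level starts at 1 and each ancestor tests "level > max_depth". */
static int depth_exceeded_old(int level, int max_depth)
{
	return level > max_depth;
}

/* New check: level starts at 0 and each ancestor tests "level >= max_depth". */
static int depth_exceeded_new(int level, int max_depth)
{
	return level >= max_depth;
}

int main(void)
{
	/* cgroup.max.depth defaults to "max", i.e. INT_MAX, in the kernel. */
	int max_depth = INT_MAX;

	/* The old form can never report 1 at this boundary; the new form can. */
	printf("old: %d\n", depth_exceeded_old(INT_MAX, max_depth));
	printf("new: %d\n", depth_exceeded_new(INT_MAX, max_depth));
	return 0;
}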
@@ -6242,7 +6244,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
-#ifdef CONFIG_CPUSETS
+#ifdef CONFIG_CPUSETS_V1
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

@ -0,0 +1,305 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
|
||||
#ifndef __CPUSET_INTERNAL_H
|
||||
#define __CPUSET_INTERNAL_H
|
||||
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/union_find.h>
|
||||
|
||||
/* See "Frequency meter" comments, below. */
|
||||
|
||||
struct fmeter {
|
||||
int cnt; /* unprocessed events count */
|
||||
int val; /* most recent output value */
|
||||
time64_t time; /* clock (secs) when val computed */
|
||||
spinlock_t lock; /* guards read or write of above */
|
||||
};
|
||||
|
||||
/*
|
||||
* Invalid partition error code
|
||||
*/
|
||||
enum prs_errcode {
|
||||
PERR_NONE = 0,
|
||||
PERR_INVCPUS,
|
||||
PERR_INVPARENT,
|
||||
PERR_NOTPART,
|
||||
PERR_NOTEXCL,
|
||||
PERR_NOCPUS,
|
||||
PERR_HOTPLUG,
|
||||
PERR_CPUSEMPTY,
|
||||
PERR_HKEEPING,
|
||||
PERR_ACCESS,
|
||||
};
|
||||
|
||||
/* bits in struct cpuset flags field */
|
||||
typedef enum {
|
||||
CS_ONLINE,
|
||||
CS_CPU_EXCLUSIVE,
|
||||
CS_MEM_EXCLUSIVE,
|
||||
CS_MEM_HARDWALL,
|
||||
CS_MEMORY_MIGRATE,
|
||||
CS_SCHED_LOAD_BALANCE,
|
||||
CS_SPREAD_PAGE,
|
||||
CS_SPREAD_SLAB,
|
||||
} cpuset_flagbits_t;
|
||||
|
||||
/* The various types of files and directories in a cpuset file system */
|
||||
|
||||
typedef enum {
|
||||
FILE_MEMORY_MIGRATE,
|
||||
FILE_CPULIST,
|
||||
FILE_MEMLIST,
|
||||
FILE_EFFECTIVE_CPULIST,
|
||||
FILE_EFFECTIVE_MEMLIST,
|
||||
FILE_SUBPARTS_CPULIST,
|
||||
FILE_EXCLUSIVE_CPULIST,
|
||||
FILE_EFFECTIVE_XCPULIST,
|
||||
FILE_ISOLATED_CPULIST,
|
||||
FILE_CPU_EXCLUSIVE,
|
||||
FILE_MEM_EXCLUSIVE,
|
||||
FILE_MEM_HARDWALL,
|
||||
FILE_SCHED_LOAD_BALANCE,
|
||||
FILE_PARTITION_ROOT,
|
||||
FILE_SCHED_RELAX_DOMAIN_LEVEL,
|
||||
FILE_MEMORY_PRESSURE_ENABLED,
|
||||
FILE_MEMORY_PRESSURE,
|
||||
FILE_SPREAD_PAGE,
|
||||
FILE_SPREAD_SLAB,
|
||||
} cpuset_filetype_t;
|
||||
|
||||
struct cpuset {
|
||||
struct cgroup_subsys_state css;
|
||||
|
||||
unsigned long flags; /* "unsigned long" so bitops work */
|
||||
|
||||
/*
|
||||
* On default hierarchy:
|
||||
*
|
||||
* The user-configured masks can only be changed by writing to
|
||||
* cpuset.cpus and cpuset.mems, and won't be limited by the
|
||||
* parent masks.
|
||||
*
|
||||
* The effective masks is the real masks that apply to the tasks
|
||||
* in the cpuset. They may be changed if the configured masks are
|
||||
* changed or hotplug happens.
|
||||
*
|
||||
* effective_mask == configured_mask & parent's effective_mask,
|
||||
* and if it ends up empty, it will inherit the parent's mask.
|
||||
*
|
||||
*
|
||||
* On legacy hierarchy:
|
||||
*
|
||||
* The user-configured masks are always the same with effective masks.
|
||||
*/
|
||||
|
||||
/* user-configured CPUs and Memory Nodes allow to tasks */
|
||||
cpumask_var_t cpus_allowed;
|
||||
nodemask_t mems_allowed;
|
||||
|
||||
/* effective CPUs and Memory Nodes allow to tasks */
|
||||
cpumask_var_t effective_cpus;
|
||||
nodemask_t effective_mems;
|
||||
|
||||
/*
|
||||
* Exclusive CPUs dedicated to current cgroup (default hierarchy only)
|
||||
*
|
||||
* The effective_cpus of a valid partition root comes solely from its
|
||||
* effective_xcpus and some of the effective_xcpus may be distributed
|
||||
* to sub-partitions below & hence excluded from its effective_cpus.
|
||||
* For a valid partition root, its effective_cpus have no relationship
|
||||
* with cpus_allowed unless its exclusive_cpus isn't set.
|
||||
*
|
||||
* This value will only be set if either exclusive_cpus is set or
|
||||
* when this cpuset becomes a local partition root.
|
||||
*/
|
||||
cpumask_var_t effective_xcpus;
|
||||
|
||||
/*
|
||||
* Exclusive CPUs as requested by the user (default hierarchy only)
|
||||
*
|
||||
* Its value is independent of cpus_allowed and designates the set of
|
||||
* CPUs that can be granted to the current cpuset or its children when
|
||||
* it becomes a valid partition root. The effective set of exclusive
|
||||
* CPUs granted (effective_xcpus) depends on whether those exclusive
|
||||
* CPUs are passed down by its ancestors and not yet taken up by
|
||||
* another sibling partition root along the way.
|
||||
*
|
||||
* If its value isn't set, it defaults to cpus_allowed.
|
||||
*/
|
||||
cpumask_var_t exclusive_cpus;
|
||||
|
||||
/*
|
||||
* This is old Memory Nodes tasks took on.
|
||||
*
|
||||
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
|
||||
* - A new cpuset's old_mems_allowed is initialized when some
|
||||
* task is moved into it.
|
||||
* - old_mems_allowed is used in cpuset_migrate_mm() when we change
|
||||
* cpuset.mems_allowed and have tasks' nodemask updated, and
|
||||
* then old_mems_allowed is updated to mems_allowed.
|
||||
*/
|
||||
nodemask_t old_mems_allowed;
|
||||
|
||||
struct fmeter fmeter; /* memory_pressure filter */
|
||||
|
||||
/*
|
||||
* Tasks are being attached to this cpuset. Used to prevent
|
||||
* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
|
||||
*/
|
||||
int attach_in_progress;
|
||||
|
||||
/* for custom sched domain */
|
||||
int relax_domain_level;
|
||||
|
||||
/* number of valid local child partitions */
|
||||
int nr_subparts;
|
||||
|
||||
/* partition root state */
|
||||
int partition_root_state;
|
||||
|
||||
/*
|
||||
* number of SCHED_DEADLINE tasks attached to this cpuset, so that we
|
||||
* know when to rebuild associated root domain bandwidth information.
|
||||
*/
|
||||
int nr_deadline_tasks;
|
||||
int nr_migrate_dl_tasks;
|
||||
u64 sum_migrate_dl_bw;
|
||||
|
||||
/* Invalid partition error code, not lock protected */
|
||||
enum prs_errcode prs_err;
|
||||
|
||||
/* Handle for cpuset.cpus.partition */
|
||||
struct cgroup_file partition_file;
|
||||
|
||||
/* Remote partition silbling list anchored at remote_children */
|
||||
struct list_head remote_sibling;
|
||||
|
||||
/* Used to merge intersecting subsets for generate_sched_domains */
|
||||
struct uf_node node;
|
||||
};
|
||||
|
||||
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
|
||||
{
|
||||
return css ? container_of(css, struct cpuset, css) : NULL;
|
||||
}
|
||||
|
||||
/* Retrieve the cpuset for a task */
|
||||
static inline struct cpuset *task_cs(struct task_struct *task)
|
||||
{
|
||||
return css_cs(task_css(task, cpuset_cgrp_id));
|
||||
}
|
||||
|
||||
static inline struct cpuset *parent_cs(struct cpuset *cs)
|
||||
{
|
||||
return css_cs(cs->css.parent);
|
||||
}
|
||||
|
||||
/* convenient tests for these bits */
|
||||
static inline bool is_cpuset_online(struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
|
||||
}
|
||||
|
||||
static inline int is_cpu_exclusive(const struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
|
||||
}
|
||||
|
||||
static inline int is_mem_exclusive(const struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
|
||||
}
|
||||
|
||||
static inline int is_mem_hardwall(const struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_MEM_HARDWALL, &cs->flags);
|
||||
}
|
||||
|
||||
static inline int is_sched_load_balance(const struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
|
||||
}
|
||||
|
||||
static inline int is_memory_migrate(const struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
|
||||
}
|
||||
|
||||
static inline int is_spread_page(const struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_SPREAD_PAGE, &cs->flags);
|
||||
}
|
||||
|
||||
static inline int is_spread_slab(const struct cpuset *cs)
|
||||
{
|
||||
return test_bit(CS_SPREAD_SLAB, &cs->flags);
|
||||
}
|
||||
|
||||
/**
|
||||
* cpuset_for_each_child - traverse online children of a cpuset
|
||||
* @child_cs: loop cursor pointing to the current child
|
||||
* @pos_css: used for iteration
|
||||
* @parent_cs: target cpuset to walk children of
|
||||
*
|
||||
* Walk @child_cs through the online children of @parent_cs. Must be used
|
||||
* with RCU read locked.
|
||||
*/
|
||||
#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
|
||||
css_for_each_child((pos_css), &(parent_cs)->css) \
|
||||
if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
|
||||
|
||||
/**
|
||||
* cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
|
||||
* @des_cs: loop cursor pointing to the current descendant
|
||||
* @pos_css: used for iteration
|
||||
* @root_cs: target cpuset to walk ancestor of
|
||||
*
|
||||
* Walk @des_cs through the online descendants of @root_cs. Must be used
|
||||
* with RCU read locked. The caller may modify @pos_css by calling
|
||||
* css_rightmost_descendant() to skip subtree. @root_cs is included in the
|
||||
* iteration and the first node to be visited.
|
||||
*/
|
||||
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
|
||||
css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
|
||||
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
|
||||
|
||||
void rebuild_sched_domains_locked(void);
|
||||
void cpuset_callback_lock_irq(void);
|
||||
void cpuset_callback_unlock_irq(void);
|
||||
void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus);
|
||||
void cpuset_update_tasks_nodemask(struct cpuset *cs);
|
||||
int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on);
|
||||
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes, loff_t off);
|
||||
int cpuset_common_seq_show(struct seq_file *sf, void *v);
|
||||
|
||||
/*
|
||||
* cpuset-v1.c
|
||||
*/
|
||||
#ifdef CONFIG_CPUSETS_V1
|
||||
extern struct cftype cpuset1_files[];
|
||||
void fmeter_init(struct fmeter *fmp);
|
||||
void cpuset1_update_task_spread_flags(struct cpuset *cs,
|
||||
struct task_struct *tsk);
|
||||
void cpuset1_update_tasks_flags(struct cpuset *cs);
|
||||
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
|
||||
struct cpumask *new_cpus, nodemask_t *new_mems,
|
||||
bool cpus_updated, bool mems_updated);
|
||||
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial);
|
||||
#else
|
||||
static inline void fmeter_init(struct fmeter *fmp) {}
|
||||
static inline void cpuset1_update_task_spread_flags(struct cpuset *cs,
|
||||
struct task_struct *tsk) {}
|
||||
static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {}
|
||||
static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs,
|
||||
struct cpumask *new_cpus, nodemask_t *new_mems,
|
||||
bool cpus_updated, bool mems_updated) {}
|
||||
static inline int cpuset1_validate_change(struct cpuset *cur,
|
||||
struct cpuset *trial) { return 0; }
|
||||
#endif /* CONFIG_CPUSETS_V1 */
|
||||
|
||||
#endif /* __CPUSET_INTERNAL_H */
|
|
@ -0,0 +1,603 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include "cgroup-internal.h"
|
||||
#include "cpuset-internal.h"
|
||||
|
||||
/*
|
||||
* Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
|
||||
*/
|
||||
struct cpuset_remove_tasks_struct {
|
||||
struct work_struct work;
|
||||
struct cpuset *cs;
|
||||
};
|
||||
|
||||
/*
|
||||
* Frequency meter - How fast is some event occurring?
|
||||
*
|
||||
* These routines manage a digitally filtered, constant time based,
|
||||
* event frequency meter. There are four routines:
|
||||
* fmeter_init() - initialize a frequency meter.
|
||||
* fmeter_markevent() - called each time the event happens.
|
||||
* fmeter_getrate() - returns the recent rate of such events.
|
||||
* fmeter_update() - internal routine used to update fmeter.
|
||||
*
|
||||
* A common data structure is passed to each of these routines,
|
||||
* which is used to keep track of the state required to manage the
|
||||
* frequency meter and its digital filter.
|
||||
*
|
||||
* The filter works on the number of events marked per unit time.
|
||||
* The filter is single-pole low-pass recursive (IIR). The time unit
|
||||
* is 1 second. Arithmetic is done using 32-bit integers scaled to
|
||||
* simulate 3 decimal digits of precision (multiplied by 1000).
|
||||
*
|
||||
* With an FM_COEF of 933, and a time base of 1 second, the filter
|
||||
* has a half-life of 10 seconds, meaning that if the events quit
|
||||
* happening, then the rate returned from the fmeter_getrate()
|
||||
* will be cut in half each 10 seconds, until it converges to zero.
|
||||
*
|
||||
* It is not worth doing a real infinitely recursive filter. If more
|
||||
* than FM_MAXTICKS ticks have elapsed since the last filter event,
|
||||
* just compute FM_MAXTICKS ticks worth, by which point the level
|
||||
* will be stable.
|
||||
*
|
||||
* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
|
||||
* arithmetic overflow in the fmeter_update() routine.
|
||||
*
|
||||
* Given the simple 32 bit integer arithmetic used, this meter works
|
||||
* best for reporting rates between one per millisecond (msec) and
|
||||
* one per 32 (approx) seconds. At constant rates faster than one
|
||||
* per msec it maxes out at values just under 1,000,000. At constant
|
||||
* rates between one per msec, and one per second it will stabilize
|
||||
* to a value N*1000, where N is the rate of events per second.
|
||||
* At constant rates between one per second and one per 32 seconds,
|
||||
* it will be choppy, moving up on the seconds that have an event,
|
||||
* and then decaying until the next event. At rates slower than
|
||||
* about one in 32 seconds, it decays all the way back to zero between
|
||||
* each event.
|
||||
*/
|
||||
|
||||
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
|
||||
#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
|
||||
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
|
||||
#define FM_SCALE 1000 /* faux fixed point scale */
|
||||
|
||||
/* Initialize a frequency meter */
|
||||
void fmeter_init(struct fmeter *fmp)
|
||||
{
|
||||
fmp->cnt = 0;
|
||||
fmp->val = 0;
|
||||
fmp->time = 0;
|
||||
spin_lock_init(&fmp->lock);
|
||||
}
|
||||
|
||||
/* Internal meter update - process cnt events and update value */
|
||||
static void fmeter_update(struct fmeter *fmp)
|
||||
{
|
||||
time64_t now;
|
||||
u32 ticks;
|
||||
|
||||
now = ktime_get_seconds();
|
||||
ticks = now - fmp->time;
|
||||
|
||||
if (ticks == 0)
|
||||
return;
|
||||
|
||||
ticks = min(FM_MAXTICKS, ticks);
|
||||
while (ticks-- > 0)
|
||||
fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
|
||||
fmp->time = now;
|
||||
|
||||
fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
|
||||
fmp->cnt = 0;
|
||||
}
|
||||
|
||||
/* Process any previous ticks, then bump cnt by one (times scale). */
|
||||
static void fmeter_markevent(struct fmeter *fmp)
|
||||
{
|
||||
spin_lock(&fmp->lock);
|
||||
fmeter_update(fmp);
|
||||
fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
|
||||
spin_unlock(&fmp->lock);
|
||||
}
|
||||
|
||||
/* Process any previous ticks, then return current value. */
|
||||
static int fmeter_getrate(struct fmeter *fmp)
|
||||
{
|
||||
int val;
|
||||
|
||||
spin_lock(&fmp->lock);
|
||||
fmeter_update(fmp);
|
||||
val = fmp->val;
|
||||
spin_unlock(&fmp->lock);
|
||||
return val;
|
||||
}
|
||||
|
||||
/*
|
||||
* Collection of memory_pressure is suppressed unless
|
||||
* this flag is enabled by writing "1" to the special
|
||||
* cpuset file 'memory_pressure_enabled' in the root cpuset.
|
||||
*/
|
||||
|
||||
int cpuset_memory_pressure_enabled __read_mostly;
|
||||
|
||||
/*
|
||||
* __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
|
||||
*
|
||||
* Keep a running average of the rate of synchronous (direct)
|
||||
* page reclaim efforts initiated by tasks in each cpuset.
|
||||
*
|
||||
* This represents the rate at which some task in the cpuset
|
||||
* ran low on memory on all nodes it was allowed to use, and
|
||||
* had to enter the kernels page reclaim code in an effort to
|
||||
* create more free memory by tossing clean pages or swapping
|
||||
* or writing dirty pages.
|
||||
*
|
||||
* Display to user space in the per-cpuset read-only file
|
||||
* "memory_pressure". Value displayed is an integer
|
||||
* representing the recent rate of entry into the synchronous
|
||||
* (direct) page reclaim by any task attached to the cpuset.
|
||||
*/
|
||||
|
||||
void __cpuset_memory_pressure_bump(void)
|
||||
{
|
||||
rcu_read_lock();
|
||||
fmeter_markevent(&task_cs(current)->fmeter);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static int update_relax_domain_level(struct cpuset *cs, s64 val)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
if (val < -1 || val > sched_domain_level_max + 1)
|
||||
return -EINVAL;
|
||||
#endif
|
||||
|
||||
if (val != cs->relax_domain_level) {
|
||||
cs->relax_domain_level = val;
|
||||
if (!cpumask_empty(cs->cpus_allowed) &&
|
||||
is_sched_load_balance(cs))
|
||||
rebuild_sched_domains_locked();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
|
||||
s64 val)
|
||||
{
|
||||
struct cpuset *cs = css_cs(css);
|
||||
cpuset_filetype_t type = cft->private;
|
||||
int retval = -ENODEV;
|
||||
|
||||
cpus_read_lock();
|
||||
cpuset_lock();
|
||||
if (!is_cpuset_online(cs))
|
||||
goto out_unlock;
|
||||
|
||||
switch (type) {
|
||||
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
|
||||
retval = update_relax_domain_level(cs, val);
|
||||
break;
|
||||
default:
|
||||
retval = -EINVAL;
|
||||
break;
|
||||
}
|
||||
out_unlock:
|
||||
cpuset_unlock();
|
||||
cpus_read_unlock();
|
||||
return retval;
|
||||
}
|
||||
|
||||
static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
|
||||
{
|
||||
struct cpuset *cs = css_cs(css);
|
||||
cpuset_filetype_t type = cft->private;
|
||||
|
||||
switch (type) {
|
||||
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
|
||||
return cs->relax_domain_level;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
/* Unreachable but makes gcc happy */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* update task's spread flag if cpuset's page/slab spread flag is set
|
||||
*
|
||||
* Call with callback_lock or cpuset_mutex held. The check can be skipped
|
||||
* if on default hierarchy.
|
||||
*/
|
||||
void cpuset1_update_task_spread_flags(struct cpuset *cs,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
|
||||
return;
|
||||
|
||||
if (is_spread_page(cs))
|
||||
task_set_spread_page(tsk);
|
||||
else
|
||||
task_clear_spread_page(tsk);
|
||||
|
||||
if (is_spread_slab(cs))
|
||||
task_set_spread_slab(tsk);
|
||||
else
|
||||
task_clear_spread_slab(tsk);
|
||||
}
|
||||
|
||||
/**
|
||||
* cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
|
||||
* @cs: the cpuset in which each task's spread flags needs to be changed
|
||||
*
|
||||
* Iterate through each task of @cs updating its spread flags. As this
|
||||
* function is called with cpuset_mutex held, cpuset membership stays
|
||||
* stable.
|
||||
*/
|
||||
void cpuset1_update_tasks_flags(struct cpuset *cs)
|
||||
{
|
||||
struct css_task_iter it;
|
||||
struct task_struct *task;
|
||||
|
||||
css_task_iter_start(&cs->css, 0, &it);
|
||||
while ((task = css_task_iter_next(&it)))
|
||||
cpuset1_update_task_spread_flags(cs, task);
|
||||
css_task_iter_end(&it);
|
||||
}
|
||||
|
||||
/*
|
||||
* If CPU and/or memory hotplug handlers, below, unplug any CPUs
|
||||
* or memory nodes, we need to walk over the cpuset hierarchy,
|
||||
* removing that CPU or node from all cpusets. If this removes the
|
||||
* last CPU or node from a cpuset, then move the tasks in the empty
|
||||
* cpuset to its next-highest non-empty parent.
|
||||
*/
|
||||
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
|
||||
{
|
||||
struct cpuset *parent;
|
||||
|
||||
/*
|
||||
* Find its next-highest non-empty parent, (top cpuset
|
||||
* has online cpus, so can't be empty).
|
||||
*/
|
||||
parent = parent_cs(cs);
|
||||
while (cpumask_empty(parent->cpus_allowed) ||
|
||||
nodes_empty(parent->mems_allowed))
|
||||
parent = parent_cs(parent);
|
||||
|
||||
if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
|
||||
pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
|
||||
pr_cont_cgroup_name(cs->css.cgroup);
|
||||
pr_cont("\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void cpuset_migrate_tasks_workfn(struct work_struct *work)
|
||||
{
|
||||
struct cpuset_remove_tasks_struct *s;
|
||||
|
||||
s = container_of(work, struct cpuset_remove_tasks_struct, work);
|
||||
remove_tasks_in_empty_cpuset(s->cs);
|
||||
css_put(&s->cs->css);
|
||||
kfree(s);
|
||||
}
|
||||
|
||||
void cpuset1_hotplug_update_tasks(struct cpuset *cs,
|
||||
struct cpumask *new_cpus, nodemask_t *new_mems,
|
||||
bool cpus_updated, bool mems_updated)
|
||||
{
|
||||
bool is_empty;
|
||||
|
||||
cpuset_callback_lock_irq();
|
||||
cpumask_copy(cs->cpus_allowed, new_cpus);
|
||||
cpumask_copy(cs->effective_cpus, new_cpus);
|
||||
cs->mems_allowed = *new_mems;
|
||||
cs->effective_mems = *new_mems;
|
||||
cpuset_callback_unlock_irq();
|
||||
|
||||
/*
|
||||
* Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
|
||||
* as the tasks will be migrated to an ancestor.
|
||||
*/
|
||||
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
|
||||
cpuset_update_tasks_cpumask(cs, new_cpus);
|
||||
if (mems_updated && !nodes_empty(cs->mems_allowed))
|
||||
cpuset_update_tasks_nodemask(cs);
|
||||
|
||||
is_empty = cpumask_empty(cs->cpus_allowed) ||
|
||||
nodes_empty(cs->mems_allowed);
|
||||
|
||||
/*
|
||||
* Move tasks to the nearest ancestor with execution resources,
|
||||
* This is full cgroup operation which will also call back into
|
||||
* cpuset. Execute it asynchronously using workqueue.
|
||||
*/
|
||||
if (is_empty && cs->css.cgroup->nr_populated_csets &&
|
||||
css_tryget_online(&cs->css)) {
|
||||
struct cpuset_remove_tasks_struct *s;
|
||||
|
||||
s = kzalloc(sizeof(*s), GFP_KERNEL);
|
||||
if (WARN_ON_ONCE(!s)) {
|
||||
css_put(&cs->css);
|
||||
return;
|
||||
}
|
||||
|
||||
s->cs = cs;
|
||||
INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
|
||||
schedule_work(&s->work);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
|
||||
*
|
||||
* One cpuset is a subset of another if all its allowed CPUs and
|
||||
* Memory Nodes are a subset of the other, and its exclusive flags
|
||||
* are only set if the other's are set. Call holding cpuset_mutex.
|
||||
*/
|
||||
|
||||
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
|
||||
{
|
||||
return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
|
||||
nodes_subset(p->mems_allowed, q->mems_allowed) &&
|
||||
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
|
||||
is_mem_exclusive(p) <= is_mem_exclusive(q);
|
||||
}
|
||||
|
||||
/*
|
||||
* cpuset1_validate_change() - Validate conditions specific to legacy (v1)
|
||||
* behavior.
|
||||
*/
|
||||
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct cpuset *c, *par;
|
||||
int ret;
|
||||
|
||||
WARN_ON_ONCE(!rcu_read_lock_held());
|
||||
|
||||
/* Each of our child cpusets must be a subset of us */
|
||||
ret = -EBUSY;
|
||||
cpuset_for_each_child(c, css, cur)
|
||||
if (!is_cpuset_subset(c, trial))
|
||||
goto out;
|
||||
|
||||
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
|
||||
ret = -EACCES;
|
||||
par = parent_cs(cur);
|
||||
if (par && !is_cpuset_subset(trial, par))
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROC_PID_CPUSET
|
||||
/*
|
||||
* proc_cpuset_show()
|
||||
* - Print tasks cpuset path into seq_file.
|
||||
* - Used for /proc/<pid>/cpuset.
|
||||
*/
|
||||
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *tsk)
|
||||
{
|
||||
char *buf;
|
||||
struct cgroup_subsys_state *css;
|
||||
int retval;
|
||||
|
||||
retval = -ENOMEM;
|
||||
buf = kmalloc(PATH_MAX, GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto out;
|
||||
|
||||
rcu_read_lock();
|
||||
spin_lock_irq(&css_set_lock);
|
||||
css = task_css(tsk, cpuset_cgrp_id);
|
||||
retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
|
||||
current->nsproxy->cgroup_ns);
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (retval == -E2BIG)
|
||||
retval = -ENAMETOOLONG;
|
||||
if (retval < 0)
|
||||
goto out_free;
|
||||
seq_puts(m, buf);
|
||||
seq_putc(m, '\n');
|
||||
retval = 0;
|
||||
out_free:
|
||||
kfree(buf);
|
||||
out:
|
||||
return retval;
|
||||
}
|
||||
#endif /* CONFIG_PROC_PID_CPUSET */
|
||||
|
||||
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
|
||||
{
|
||||
struct cpuset *cs = css_cs(css);
|
||||
cpuset_filetype_t type = cft->private;
|
||||
|
||||
switch (type) {
|
||||
case FILE_CPU_EXCLUSIVE:
|
||||
return is_cpu_exclusive(cs);
|
||||
case FILE_MEM_EXCLUSIVE:
|
||||
return is_mem_exclusive(cs);
|
||||
case FILE_MEM_HARDWALL:
|
||||
return is_mem_hardwall(cs);
|
||||
case FILE_SCHED_LOAD_BALANCE:
|
||||
return is_sched_load_balance(cs);
|
||||
case FILE_MEMORY_MIGRATE:
|
||||
return is_memory_migrate(cs);
|
||||
case FILE_MEMORY_PRESSURE_ENABLED:
|
||||
return cpuset_memory_pressure_enabled;
|
||||
case FILE_MEMORY_PRESSURE:
|
||||
return fmeter_getrate(&cs->fmeter);
|
||||
case FILE_SPREAD_PAGE:
|
||||
return is_spread_page(cs);
|
||||
case FILE_SPREAD_SLAB:
|
||||
return is_spread_slab(cs);
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
/* Unreachable but makes gcc happy */
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
|
||||
u64 val)
|
||||
{
|
||||
struct cpuset *cs = css_cs(css);
|
||||
cpuset_filetype_t type = cft->private;
|
||||
int retval = 0;
|
||||
|
||||
cpus_read_lock();
|
||||
cpuset_lock();
|
||||
if (!is_cpuset_online(cs)) {
|
||||
retval = -ENODEV;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case FILE_CPU_EXCLUSIVE:
|
||||
retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
|
||||
break;
|
||||
case FILE_MEM_EXCLUSIVE:
|
||||
retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
|
||||
break;
|
||||
case FILE_MEM_HARDWALL:
|
||||
retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
|
||||
break;
|
||||
case FILE_SCHED_LOAD_BALANCE:
|
||||
retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
|
||||
break;
|
||||
case FILE_MEMORY_MIGRATE:
|
||||
retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
|
||||
break;
|
||||
case FILE_MEMORY_PRESSURE_ENABLED:
|
||||
cpuset_memory_pressure_enabled = !!val;
|
||||
break;
|
||||
case FILE_SPREAD_PAGE:
|
||||
retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
|
||||
break;
|
||||
case FILE_SPREAD_SLAB:
|
||||
retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
|
||||
break;
|
||||
default:
|
||||
retval = -EINVAL;
|
||||
break;
|
||||
}
|
||||
out_unlock:
|
||||
cpuset_unlock();
|
||||
cpus_read_unlock();
|
||||
return retval;
|
||||
}
|
||||
|
||||
/*
|
||||
* for the common functions, 'private' gives the type of file
|
||||
*/
|
||||
|
||||
struct cftype cpuset1_files[] = {
|
||||
{
|
||||
.name = "cpus",
|
||||
.seq_show = cpuset_common_seq_show,
|
||||
.write = cpuset_write_resmask,
|
||||
.max_write_len = (100U + 6 * NR_CPUS),
|
||||
.private = FILE_CPULIST,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "mems",
|
||||
.seq_show = cpuset_common_seq_show,
|
||||
.write = cpuset_write_resmask,
|
||||
.max_write_len = (100U + 6 * MAX_NUMNODES),
|
||||
.private = FILE_MEMLIST,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "effective_cpus",
|
||||
.seq_show = cpuset_common_seq_show,
|
||||
.private = FILE_EFFECTIVE_CPULIST,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "effective_mems",
|
||||
.seq_show = cpuset_common_seq_show,
|
||||
.private = FILE_EFFECTIVE_MEMLIST,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "cpu_exclusive",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_CPU_EXCLUSIVE,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "mem_exclusive",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_MEM_EXCLUSIVE,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "mem_hardwall",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_MEM_HARDWALL,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "sched_load_balance",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_SCHED_LOAD_BALANCE,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "sched_relax_domain_level",
|
||||
.read_s64 = cpuset_read_s64,
|
||||
.write_s64 = cpuset_write_s64,
|
||||
.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "memory_migrate",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_MEMORY_MIGRATE,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "memory_pressure",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.private = FILE_MEMORY_PRESSURE,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "memory_spread_page",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_SPREAD_PAGE,
|
||||
},
|
||||
|
||||
{
|
||||
/* obsolete, may be removed in the future */
|
||||
.name = "memory_spread_slab",
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_SPREAD_SLAB,
|
||||
},
|
||||
|
||||
{
|
||||
.name = "memory_pressure_enabled",
|
||||
.flags = CFTYPE_ONLY_ON_ROOT,
|
||||
.read_u64 = cpuset_read_u64,
|
||||
.write_u64 = cpuset_write_u64,
|
||||
.private = FILE_MEMORY_PRESSURE_ENABLED,
|
||||
},
|
||||
|
||||
{ } /* terminate */
|
||||
};
|
(File diff suppressed because it is too large.)
@@ -272,15 +272,10 @@ static void pids_event(struct pids_cgroup *pids_forking,
*/
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
-struct cgroup_subsys_state *css;
struct pids_cgroup *pids, *pids_over_limit;
int err;

-if (cset)
-css = cset->subsys[pids_cgrp_id];
-else
-css = task_css_check(current, pids_cgrp_id, true);
-pids = css_pids(css);
+pids = css_pids(cset->subsys[pids_cgrp_id]);
err = pids_try_charge(pids, 1, &pids_over_limit);
if (err)
pids_event(pids, pids_over_limit);

@@ -290,14 +285,9 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)

static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
-struct cgroup_subsys_state *css;
struct pids_cgroup *pids;

-if (cset)
-css = cset->subsys[pids_cgrp_id];
-else
-css = task_css_check(current, pids_cgrp_id, true);
-pids = css_pids(css);
+pids = css_pids(cset->subsys[pids_cgrp_id]);
pids_uncharge(pids, 1);
}

@@ -444,6 +444,7 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
+dst_bstat->ntime += src_bstat->ntime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,

@@ -455,6 +456,7 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
#ifdef CONFIG_SCHED_CORE
dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
+dst_bstat->ntime -= src_bstat->ntime;
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)

@@ -534,8 +536,10 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

switch (index) {
-case CPUTIME_USER:
case CPUTIME_NICE:
+rstatc->bstat.ntime += delta_exec;
+fallthrough;
+case CPUTIME_USER:
rstatc->bstat.cputime.utime += delta_exec;
break;
case CPUTIME_SYSTEM:

@@ -590,6 +594,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
#ifdef CONFIG_SCHED_CORE
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
+bstat->ntime += cpustat[CPUTIME_NICE];
}
}

@@ -607,13 +612,14 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
-u64 usage, utime, stime;
+u64 usage, utime, stime, ntime;

if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+ntime = cgrp->bstat.ntime;
cgroup_rstat_flush_release(cgrp);
} else {
/* cgrp->bstat of root is not actually used, reuse it */

@@ -621,16 +627,19 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
usage = cgrp->bstat.cputime.sum_exec_runtime;
utime = cgrp->bstat.cputime.utime;
stime = cgrp->bstat.cputime.stime;
+ntime = cgrp->bstat.ntime;
}

do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+do_div(ntime, NSEC_PER_USEC);

seq_printf(seq, "usage_usec %llu\n"
-"user_usec %llu\n"
-"system_usec %llu\n",
-usage, utime, stime);
+"user_usec %llu\n"
+"system_usec %llu\n"
+"nice_usec %llu\n",
+usage, utime, stime, ntime);

cgroup_force_idle_show(seq, &cgrp->bstat);
}
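Taken together, the rstat hunks above add a nice_usec field to cpu.stat; nice time is still folded into user_usec (note the fallthrough from CPUTIME_NICE into CPUTIME_USER), and nice_usec merely breaks it out. A hypothetical cpu.stat excerpt after this change (values illustrative only):

usage_usec 212000
user_usec 150000
system_usec 62000
nice_usec 12000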
@@ -2377,7 +2377,6 @@ static __latent_entropy struct task_struct *copy_process(
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
-p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -1535,7 +1535,7 @@ static int numa_hint_fault_latency(struct folio *folio)
int last_time, time;

time = jiffies_to_msecs(jiffies);
-last_time = xchg_page_access_time(&folio->page, time);
+last_time = folio_xchg_access_time(folio, time);

return (time - last_time) & PAGE_ACCESS_TIME_MASK;
}

@@ -1637,7 +1637,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
}

this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
-last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid);
+last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);

if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
@@ -29,6 +29,7 @@ KCOV_INSTRUMENT_page_alloc.o := n
KCOV_INSTRUMENT_debug-pagealloc.o := n
KCOV_INSTRUMENT_kmemleak.o := n
KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_memcontrol-v1.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
KCOV_INSTRUMENT_failslab.o := n

@@ -96,7 +97,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
-obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG) += memcontrol.o memcontrol-v1.o vmpressure.o
ifdef CONFIG_SWAP
obj-$(CONFIG_MEMCG) += swap_cgroup.o
endif
@@ -1560,7 +1560,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) ||
node_is_toptier(nid))
-last_cpupid = page_cpupid_last(&folio->page);
+last_cpupid = folio_last_cpupid(folio);
target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
if (target_nid == NUMA_NO_NODE) {
folio_put(folio);

@@ -1863,7 +1863,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
-struct page *page = pfn_swap_entry_to_page(entry);
+struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
pmd_t newpmd;

VM_BUG_ON(!is_pmd_migration_entry(*pmd));

@@ -1872,7 +1872,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* A protection check is difficult so
* just be safe and disable write
*/
-if (PageAnon(page))
+if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(swp_offset(entry));
else
entry = make_readable_migration_entry(swp_offset(entry));

@@ -1894,7 +1894,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
#endif

if (prot_numa) {
-struct page *page;
+struct folio *folio;
bool toptier;
/*
* Avoid trapping faults against the zero page. The read-only

@@ -1907,8 +1907,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_protnone(*pmd))
goto unlock;

-page = pmd_page(*pmd);
-toptier = node_is_toptier(page_to_nid(page));
+folio = page_folio(pmd_page(*pmd));
+toptier = node_is_toptier(folio_nid(folio));
/*
* Skip scanning top tier node if normal numa
* balancing is disabled

@@ -1919,7 +1919,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,

if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
!toptier)
-xchg_page_access_time(page, jiffies_to_msecs(jiffies));
+folio_xchg_access_time(folio,
+jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical

@@ -2526,7 +2527,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
if (page_is_idle(head))
set_page_idle(page_tail);

-page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));

/*
* always add to the tail because some iterators expect new
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/memcontrol.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+
+#include "memcontrol-v1.h"
+
+/*
+* Cgroups above their limits are maintained in a RB-Tree, independent of
+* their hierarchy representation
+*/
+
+struct mem_cgroup_tree_per_node {
+struct rb_root rb_root;
+struct rb_node *rb_rightmost;
+spinlock_t lock;
+};
+
+struct mem_cgroup_tree {
+struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
+* Maximum loops in mem_cgroup_soft_reclaim(), used for soft
+* limit reclaim to prevent infinite loops, if they ever occur.
+*/
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
+
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
+struct mem_cgroup_tree_per_node *mctz,
+unsigned long new_usage_in_excess)
+{
+struct rb_node **p = &mctz->rb_root.rb_node;
+struct rb_node *parent = NULL;
+struct mem_cgroup_per_node *mz_node;
+bool rightmost = true;
+
+if (mz->on_tree)
+return;
+
+mz->usage_in_excess = new_usage_in_excess;
+if (!mz->usage_in_excess)
+return;
+while (*p) {
+parent = *p;
+mz_node = rb_entry(parent, struct mem_cgroup_per_node,
+tree_node);
+if (mz->usage_in_excess < mz_node->usage_in_excess) {
+p = &(*p)->rb_left;
+rightmost = false;
+} else {
+p = &(*p)->rb_right;
+}
+}
+
+if (rightmost)
+mctz->rb_rightmost = &mz->tree_node;
+
+rb_link_node(&mz->tree_node, parent, p);
+rb_insert_color(&mz->tree_node, &mctz->rb_root);
+mz->on_tree = true;
+}
+
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+struct mem_cgroup_tree_per_node *mctz)
+{
+if (!mz->on_tree)
+return;
+
+if (&mz->tree_node == mctz->rb_rightmost)
+mctz->rb_rightmost = rb_prev(&mz->tree_node);
+
+rb_erase(&mz->tree_node, &mctz->rb_root);
+mz->on_tree = false;
+}
+
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
+struct mem_cgroup_tree_per_node *mctz)
+{
+unsigned long flags;
+
+spin_lock_irqsave(&mctz->lock, flags);
+__mem_cgroup_remove_exceeded(mz, mctz);
+spin_unlock_irqrestore(&mctz->lock, flags);
+}
+
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
+{
+unsigned long nr_pages = page_counter_read(&memcg->memory);
+unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
+unsigned long excess = 0;
+
+if (nr_pages > soft_limit)
+excess = nr_pages - soft_limit;
+
+return excess;
+}
+
+void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
+{
+unsigned long excess;
+struct mem_cgroup_per_node *mz;
+struct mem_cgroup_tree_per_node *mctz;
+
+if (lru_gen_enabled()) {
+if (soft_limit_excess(memcg))
+lru_gen_soft_reclaim(memcg, nid);
+return;
+}
+
+mctz = soft_limit_tree.rb_tree_per_node[nid];
+if (!mctz)
+return;
+/*
+* Necessary to update all ancestors when hierarchy is used.
+* because their event counter is not touched.
+*/
+for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+mz = memcg->nodeinfo[nid];
+excess = soft_limit_excess(memcg);
+/*
+* We have to update the tree if mz is on RB-tree or
+* mem is over its softlimit.
+*/
+if (excess || mz->on_tree) {
+unsigned long flags;
+
+spin_lock_irqsave(&mctz->lock, flags);
+/* if on-tree, remove it */
+if (mz->on_tree)
|
||||
__mem_cgroup_remove_exceeded(mz, mctz);
|
||||
/*
|
||||
* Insert again. mz->usage_in_excess will be updated.
|
||||
* If excess is 0, no tree ops.
|
||||
*/
|
||||
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
||||
spin_unlock_irqrestore(&mctz->lock, flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
|
||||
{
|
||||
struct mem_cgroup_tree_per_node *mctz;
|
||||
struct mem_cgroup_per_node *mz;
|
||||
int nid;
|
||||
|
||||
for_each_node(nid) {
|
||||
mz = memcg->nodeinfo[nid];
|
||||
mctz = soft_limit_tree.rb_tree_per_node[nid];
|
||||
if (mctz)
|
||||
mem_cgroup_remove_exceeded(mz, mctz);
|
||||
}
|
||||
}
|
||||
|
||||
static struct mem_cgroup_per_node *
|
||||
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
||||
{
|
||||
struct mem_cgroup_per_node *mz;
|
||||
|
||||
retry:
|
||||
mz = NULL;
|
||||
if (!mctz->rb_rightmost)
|
||||
goto done; /* Nothing to reclaim from */
|
||||
|
||||
mz = rb_entry(mctz->rb_rightmost,
|
||||
struct mem_cgroup_per_node, tree_node);
|
||||
/*
|
||||
* Remove the node now but someone else can add it back,
|
||||
* we will to add it back at the end of reclaim to its correct
|
||||
* position in the tree.
|
||||
*/
|
||||
__mem_cgroup_remove_exceeded(mz, mctz);
|
||||
if (!soft_limit_excess(mz->memcg) ||
|
||||
!css_tryget(&mz->memcg->css))
|
||||
goto retry;
|
||||
done:
|
||||
return mz;
|
||||
}
|
||||
|
||||
static struct mem_cgroup_per_node *
|
||||
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
||||
{
|
||||
struct mem_cgroup_per_node *mz;
|
||||
|
||||
spin_lock_irq(&mctz->lock);
|
||||
mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
||||
spin_unlock_irq(&mctz->lock);
|
||||
return mz;
|
||||
}
|
||||
|
||||
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
|
||||
pg_data_t *pgdat,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned)
|
||||
{
|
||||
struct mem_cgroup *victim = NULL;
|
||||
int total = 0;
|
||||
int loop = 0;
|
||||
unsigned long excess;
|
||||
unsigned long nr_scanned;
|
||||
struct mem_cgroup_reclaim_cookie reclaim = {
|
||||
.pgdat = pgdat,
|
||||
};
|
||||
|
||||
excess = soft_limit_excess(root_memcg);
|
||||
|
||||
while (1) {
|
||||
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
|
||||
if (!victim) {
|
||||
loop++;
|
||||
if (loop >= 2) {
|
||||
/*
|
||||
* If we have not been able to reclaim
|
||||
* anything, it might because there are
|
||||
* no reclaimable pages under this hierarchy
|
||||
*/
|
||||
if (!total)
|
||||
break;
|
||||
/*
|
||||
* We want to do more targeted reclaim.
|
||||
* excess >> 2 is not to excessive so as to
|
||||
* reclaim too much, nor too less that we keep
|
||||
* coming back to reclaim from this cgroup
|
||||
*/
|
||||
if (total >= (excess >> 2) ||
|
||||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
total += mem_cgroup_shrink_node(victim, gfp_mask, false,
|
||||
pgdat, &nr_scanned);
|
||||
*total_scanned += nr_scanned;
|
||||
if (!soft_limit_excess(root_memcg))
|
||||
break;
|
||||
}
|
||||
mem_cgroup_iter_break(root_memcg, victim);
|
||||
return total;
|
||||
}
|
||||
|
||||
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned)
|
||||
{
|
||||
unsigned long nr_reclaimed = 0;
|
||||
struct mem_cgroup_per_node *mz, *next_mz = NULL;
|
||||
unsigned long reclaimed;
|
||||
int loop = 0;
|
||||
struct mem_cgroup_tree_per_node *mctz;
|
||||
unsigned long excess;
|
||||
|
||||
if (lru_gen_enabled())
|
||||
return 0;
|
||||
|
||||
if (order > 0)
|
||||
return 0;
|
||||
|
||||
mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
|
||||
|
||||
/*
|
||||
* Do not even bother to check the largest node if the root
|
||||
* is empty. Do it lockless to prevent lock bouncing. Races
|
||||
* are acceptable as soft limit is best effort anyway.
|
||||
*/
|
||||
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* This loop can run a while, specially if mem_cgroup's continuously
|
||||
* keep exceeding their soft limit and putting the system under
|
||||
* pressure
|
||||
*/
|
||||
do {
|
||||
if (next_mz)
|
||||
mz = next_mz;
|
||||
else
|
||||
mz = mem_cgroup_largest_soft_limit_node(mctz);
|
||||
if (!mz)
|
||||
break;
|
||||
|
||||
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
|
||||
gfp_mask, total_scanned);
|
||||
nr_reclaimed += reclaimed;
|
||||
spin_lock_irq(&mctz->lock);
|
||||
|
||||
/*
|
||||
* If we failed to reclaim anything from this memory cgroup
|
||||
* it is time to move on to the next cgroup
|
||||
*/
|
||||
next_mz = NULL;
|
||||
if (!reclaimed)
|
||||
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
||||
|
||||
excess = soft_limit_excess(mz->memcg);
|
||||
/*
|
||||
* One school of thought says that we should not add
|
||||
* back the node to the tree if reclaim returns 0.
|
||||
* But our reclaim could return 0, simply because due
|
||||
* to priority we are exposing a smaller subset of
|
||||
* memory to reclaim from. Consider this as a longer
|
||||
* term TODO.
|
||||
*/
|
||||
/* If excess == 0, no tree ops */
|
||||
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
||||
spin_unlock_irq(&mctz->lock);
|
||||
css_put(&mz->memcg->css);
|
||||
loop++;
|
||||
/*
|
||||
* Could not reclaim anything and there are no more
|
||||
* mem cgroups to try or we seem to be looping without
|
||||
* reclaiming anything.
|
||||
*/
|
||||
if (!nr_reclaimed &&
|
||||
(next_mz == NULL ||
|
||||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
|
||||
break;
|
||||
} while (!nr_reclaimed);
|
||||
if (next_mz)
|
||||
css_put(&next_mz->memcg->css);
|
||||
return nr_reclaimed;
|
||||
}
|
||||
|
||||
static int __init memcg1_init(void)
|
||||
{
|
||||
int node;
|
||||
|
||||
for_each_node(node) {
|
||||
struct mem_cgroup_tree_per_node *rtpn;
|
||||
|
||||
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
|
||||
|
||||
rtpn->rb_root = RB_ROOT;
|
||||
rtpn->rb_rightmost = NULL;
|
||||
spin_lock_init(&rtpn->lock);
|
||||
soft_limit_tree.rb_tree_per_node[node] = rtpn;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(memcg1_init);
|
|
@ -0,0 +1,14 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
|
||||
#ifndef __MM_MEMCONTROL_V1_H
|
||||
#define __MM_MEMCONTROL_V1_H
|
||||
|
||||
void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid);
|
||||
void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg);
|
||||
|
||||
static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg)
|
||||
{
|
||||
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
|
||||
}
|
||||
|
||||
#endif /* __MM_MEMCONTROL_V1_H */
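For orientation, the v1 soft-limit machinery collected in the new memcontrol-v1.c/h above is driven by the value user space writes through the legacy interface file memory.soft_limit_in_bytes. The standalone sketch below shows that write; it is illustrative only and not part of this backport, and the mount point and group name are assumptions.

/* Hedged sketch: set a cgroup v1 memory soft limit from user space.
 * Assumes the v1 memory controller is mounted at /sys/fs/cgroup/memory
 * and that a child group named "demo" already exists (both assumptions).
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/demo/memory.soft_limit_in_bytes";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	/* 64 MiB soft limit; reclaim then prefers this group under pressure */
	fprintf(f, "%llu\n", 64ULL << 20);
	fclose(f);
	return EXIT_SUCCESS;
}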
|
mm/memcontrol.c (337 lines changed)
|
@ -70,6 +70,7 @@
|
|||
#include <net/ip.h>
|
||||
#include "slab.h"
|
||||
#include "swap.h"
|
||||
#include "memcontrol-v1.h"
|
||||
#include <linux/zswap.h>
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
|
@ -107,23 +108,6 @@ static bool do_memsw_account(void)
|
|||
#define THRESHOLDS_EVENTS_TARGET 128
|
||||
#define SOFTLIMIT_EVENTS_TARGET 1024
|
||||
|
||||
/*
|
||||
* Cgroups above their limits are maintained in a RB-Tree, independent of
|
||||
* their hierarchy representation
|
||||
*/
|
||||
|
||||
struct mem_cgroup_tree_per_node {
|
||||
struct rb_root rb_root;
|
||||
struct rb_node *rb_rightmost;
|
||||
spinlock_t lock;
|
||||
};
|
||||
|
||||
struct mem_cgroup_tree {
|
||||
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
|
||||
};
|
||||
|
||||
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
|
||||
|
||||
/* for OOM */
|
||||
struct mem_cgroup_eventfd_list {
|
||||
struct list_head list;
|
||||
|
@ -198,13 +182,6 @@ static struct move_charge_struct {
|
|||
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
|
||||
};
|
||||
|
||||
/*
|
||||
* Maximum loops in mem_cgroup_soft_reclaim(), used for soft
|
||||
* limit reclaim to prevent infinite loops, if they ever occur.
|
||||
*/
|
||||
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
|
||||
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
|
||||
|
||||
/* for encoding cft->private value on file */
|
||||
enum res_type {
|
||||
_MEM,
|
||||
|
@ -420,169 +397,6 @@ ino_t page_cgroup_ino(struct page *page)
|
|||
return ino;
|
||||
}
|
||||
|
||||
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
|
||||
struct mem_cgroup_tree_per_node *mctz,
|
||||
unsigned long new_usage_in_excess)
|
||||
{
|
||||
struct rb_node **p = &mctz->rb_root.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct mem_cgroup_per_node *mz_node;
|
||||
bool rightmost = true;
|
||||
|
||||
if (mz->on_tree)
|
||||
return;
|
||||
|
||||
mz->usage_in_excess = new_usage_in_excess;
|
||||
if (!mz->usage_in_excess)
|
||||
return;
|
||||
while (*p) {
|
||||
parent = *p;
|
||||
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
|
||||
tree_node);
|
||||
if (mz->usage_in_excess < mz_node->usage_in_excess) {
|
||||
p = &(*p)->rb_left;
|
||||
rightmost = false;
|
||||
} else {
|
||||
p = &(*p)->rb_right;
|
||||
}
|
||||
}
|
||||
|
||||
if (rightmost)
|
||||
mctz->rb_rightmost = &mz->tree_node;
|
||||
|
||||
rb_link_node(&mz->tree_node, parent, p);
|
||||
rb_insert_color(&mz->tree_node, &mctz->rb_root);
|
||||
mz->on_tree = true;
|
||||
}
|
||||
|
||||
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
|
||||
struct mem_cgroup_tree_per_node *mctz)
|
||||
{
|
||||
if (!mz->on_tree)
|
||||
return;
|
||||
|
||||
if (&mz->tree_node == mctz->rb_rightmost)
|
||||
mctz->rb_rightmost = rb_prev(&mz->tree_node);
|
||||
|
||||
rb_erase(&mz->tree_node, &mctz->rb_root);
|
||||
mz->on_tree = false;
|
||||
}
|
||||
|
||||
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
|
||||
struct mem_cgroup_tree_per_node *mctz)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&mctz->lock, flags);
|
||||
__mem_cgroup_remove_exceeded(mz, mctz);
|
||||
spin_unlock_irqrestore(&mctz->lock, flags);
|
||||
}
|
||||
|
||||
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
|
||||
{
|
||||
unsigned long nr_pages = page_counter_read(&memcg->memory);
|
||||
unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
|
||||
unsigned long excess = 0;
|
||||
|
||||
if (nr_pages > soft_limit)
|
||||
excess = nr_pages - soft_limit;
|
||||
|
||||
return excess;
|
||||
}
|
||||
|
||||
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
|
||||
{
|
||||
unsigned long excess;
|
||||
struct mem_cgroup_per_node *mz;
|
||||
struct mem_cgroup_tree_per_node *mctz;
|
||||
|
||||
if (lru_gen_enabled()) {
|
||||
if (soft_limit_excess(memcg))
|
||||
lru_gen_soft_reclaim(memcg, nid);
|
||||
return;
|
||||
}
|
||||
|
||||
mctz = soft_limit_tree.rb_tree_per_node[nid];
|
||||
if (!mctz)
|
||||
return;
|
||||
/*
|
||||
* Necessary to update all ancestors when hierarchy is used.
|
||||
* because their event counter is not touched.
|
||||
*/
|
||||
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
||||
mz = memcg->nodeinfo[nid];
|
||||
excess = soft_limit_excess(memcg);
|
||||
/*
|
||||
* We have to update the tree if mz is on RB-tree or
|
||||
* mem is over its softlimit.
|
||||
*/
|
||||
if (excess || mz->on_tree) {
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&mctz->lock, flags);
|
||||
/* if on-tree, remove it */
|
||||
if (mz->on_tree)
|
||||
__mem_cgroup_remove_exceeded(mz, mctz);
|
||||
/*
|
||||
* Insert again. mz->usage_in_excess will be updated.
|
||||
* If excess is 0, no tree ops.
|
||||
*/
|
||||
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
||||
spin_unlock_irqrestore(&mctz->lock, flags);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
|
||||
{
|
||||
struct mem_cgroup_tree_per_node *mctz;
|
||||
struct mem_cgroup_per_node *mz;
|
||||
int nid;
|
||||
|
||||
for_each_node(nid) {
|
||||
mz = memcg->nodeinfo[nid];
|
||||
mctz = soft_limit_tree.rb_tree_per_node[nid];
|
||||
if (mctz)
|
||||
mem_cgroup_remove_exceeded(mz, mctz);
|
||||
}
|
||||
}
|
||||
|
||||
static struct mem_cgroup_per_node *
|
||||
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
||||
{
|
||||
struct mem_cgroup_per_node *mz;
|
||||
|
||||
retry:
|
||||
mz = NULL;
|
||||
if (!mctz->rb_rightmost)
|
||||
goto done; /* Nothing to reclaim from */
|
||||
|
||||
mz = rb_entry(mctz->rb_rightmost,
|
||||
struct mem_cgroup_per_node, tree_node);
|
||||
/*
|
||||
* Remove the node now but someone else can add it back,
|
||||
* we will to add it back at the end of reclaim to its correct
|
||||
* position in the tree.
|
||||
*/
|
||||
__mem_cgroup_remove_exceeded(mz, mctz);
|
||||
if (!soft_limit_excess(mz->memcg) ||
|
||||
!css_tryget(&mz->memcg->css))
|
||||
goto retry;
|
||||
done:
|
||||
return mz;
|
||||
}
|
||||
|
||||
static struct mem_cgroup_per_node *
|
||||
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
|
||||
{
|
||||
struct mem_cgroup_per_node *mz;
|
||||
|
||||
spin_lock_irq(&mctz->lock);
|
||||
mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
||||
spin_unlock_irq(&mctz->lock);
|
||||
return mz;
|
||||
}
|
||||
|
||||
/*
|
||||
* memcg and lruvec stats flushing
|
||||
*
|
||||
|
@ -1846,56 +1660,6 @@ unlock:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
|
||||
pg_data_t *pgdat,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned)
|
||||
{
|
||||
struct mem_cgroup *victim = NULL;
|
||||
int total = 0;
|
||||
int loop = 0;
|
||||
unsigned long excess;
|
||||
unsigned long nr_scanned;
|
||||
struct mem_cgroup_reclaim_cookie reclaim = {
|
||||
.pgdat = pgdat,
|
||||
};
|
||||
|
||||
excess = soft_limit_excess(root_memcg);
|
||||
|
||||
while (1) {
|
||||
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
|
||||
if (!victim) {
|
||||
loop++;
|
||||
if (loop >= 2) {
|
||||
/*
|
||||
* If we have not been able to reclaim
|
||||
* anything, it might because there are
|
||||
* no reclaimable pages under this hierarchy
|
||||
*/
|
||||
if (!total)
|
||||
break;
|
||||
/*
|
||||
* We want to do more targeted reclaim.
|
||||
* excess >> 2 is not to excessive so as to
|
||||
* reclaim too much, nor too less that we keep
|
||||
* coming back to reclaim from this cgroup
|
||||
*/
|
||||
if (total >= (excess >> 2) ||
|
||||
(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
total += mem_cgroup_shrink_node(victim, gfp_mask, false,
|
||||
pgdat, &nr_scanned);
|
||||
*total_scanned += nr_scanned;
|
||||
if (!soft_limit_excess(root_memcg))
|
||||
break;
|
||||
}
|
||||
mem_cgroup_iter_break(root_memcg, victim);
|
||||
return total;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
static struct lockdep_map memcg_oom_lock_dep_map = {
|
||||
.name = "memcg_oom_lock",
|
||||
|
@ -3744,88 +3508,6 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
|
|||
return ret;
|
||||
}
|
||||
|
||||
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
|
||||
gfp_t gfp_mask,
|
||||
unsigned long *total_scanned)
|
||||
{
|
||||
unsigned long nr_reclaimed = 0;
|
||||
struct mem_cgroup_per_node *mz, *next_mz = NULL;
|
||||
unsigned long reclaimed;
|
||||
int loop = 0;
|
||||
struct mem_cgroup_tree_per_node *mctz;
|
||||
unsigned long excess;
|
||||
|
||||
if (lru_gen_enabled())
|
||||
return 0;
|
||||
|
||||
if (order > 0)
|
||||
return 0;
|
||||
|
||||
mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
|
||||
|
||||
/*
|
||||
* Do not even bother to check the largest node if the root
|
||||
* is empty. Do it lockless to prevent lock bouncing. Races
|
||||
* are acceptable as soft limit is best effort anyway.
|
||||
*/
|
||||
if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* This loop can run a while, specially if mem_cgroup's continuously
|
||||
* keep exceeding their soft limit and putting the system under
|
||||
* pressure
|
||||
*/
|
||||
do {
|
||||
if (next_mz)
|
||||
mz = next_mz;
|
||||
else
|
||||
mz = mem_cgroup_largest_soft_limit_node(mctz);
|
||||
if (!mz)
|
||||
break;
|
||||
|
||||
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
|
||||
gfp_mask, total_scanned);
|
||||
nr_reclaimed += reclaimed;
|
||||
spin_lock_irq(&mctz->lock);
|
||||
|
||||
/*
|
||||
* If we failed to reclaim anything from this memory cgroup
|
||||
* it is time to move on to the next cgroup
|
||||
*/
|
||||
next_mz = NULL;
|
||||
if (!reclaimed)
|
||||
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
|
||||
|
||||
excess = soft_limit_excess(mz->memcg);
|
||||
/*
|
||||
* One school of thought says that we should not add
|
||||
* back the node to the tree if reclaim returns 0.
|
||||
* But our reclaim could return 0, simply because due
|
||||
* to priority we are exposing a smaller subset of
|
||||
* memory to reclaim from. Consider this as a longer
|
||||
* term TODO.
|
||||
*/
|
||||
/* If excess == 0, no tree ops */
|
||||
__mem_cgroup_insert_exceeded(mz, mctz, excess);
|
||||
spin_unlock_irq(&mctz->lock);
|
||||
css_put(&mz->memcg->css);
|
||||
loop++;
|
||||
/*
|
||||
* Could not reclaim anything and there are no more
|
||||
* mem cgroups to try or we seem to be looping without
|
||||
* reclaiming anything.
|
||||
*/
|
||||
if (!nr_reclaimed &&
|
||||
(next_mz == NULL ||
|
||||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
|
||||
break;
|
||||
} while (!nr_reclaimed);
|
||||
if (next_mz)
|
||||
css_put(&next_mz->memcg->css);
|
||||
return nr_reclaimed;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reclaims as many pages from the given memcg as possible.
|
||||
*
|
||||
|
@ -5668,7 +5350,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
|||
return ERR_CAST(memcg);
|
||||
|
||||
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
|
||||
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
|
||||
memcg1_soft_limit_reset(memcg);
|
||||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
|
||||
memcg->zswap_max = PAGE_COUNTER_MAX;
|
||||
WRITE_ONCE(memcg->zswap_writeback, true);
|
||||
|
@ -5841,7 +5523,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
|
|||
page_counter_set_min(&memcg->memory, 0);
|
||||
page_counter_set_low(&memcg->memory, 0);
|
||||
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
|
||||
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
|
||||
memcg1_soft_limit_reset(memcg);
|
||||
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
|
||||
memcg_wb_domain_size_changed(memcg);
|
||||
}
|
||||
|
@ -7810,7 +7492,7 @@ __setup("cgroup.memory=", cgroup_memory);
|
|||
*/
|
||||
static int __init mem_cgroup_init(void)
|
||||
{
|
||||
int cpu, node;
|
||||
int cpu;
|
||||
|
||||
/*
|
||||
* Currently s32 type (can refer to struct batched_lruvec_stat) is
|
||||
|
@ -7827,17 +7509,6 @@ static int __init mem_cgroup_init(void)
|
|||
INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
|
||||
drain_local_stock);
|
||||
|
||||
for_each_node(node) {
|
||||
struct mem_cgroup_tree_per_node *rtpn;
|
||||
|
||||
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);
|
||||
|
||||
rtpn->rb_root = RB_ROOT;
|
||||
rtpn->rb_rightmost = NULL;
|
||||
spin_lock_init(&rtpn->lock);
|
||||
soft_limit_tree.rb_tree_per_node[node] = rtpn;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(mem_cgroup_init);
|
||||
|
|
mm/memory.c (38 lines changed)
|
@ -3036,23 +3036,24 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
|
|||
* case, all we need to do here is to mark the page as writable and update
|
||||
* any related book-keeping.
|
||||
*/
|
||||
static inline void wp_page_reuse(struct vm_fault *vmf)
|
||||
static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
|
||||
__releases(vmf->ptl)
|
||||
{
|
||||
struct vm_area_struct *vma = vmf->vma;
|
||||
struct page *page = vmf->page;
|
||||
pte_t entry;
|
||||
|
||||
VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
|
||||
VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
|
||||
|
||||
/*
|
||||
* Clear the pages cpupid information as the existing
|
||||
* information potentially belongs to a now completely
|
||||
* unrelated process.
|
||||
*/
|
||||
if (page)
|
||||
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
|
||||
if (folio) {
|
||||
VM_BUG_ON(folio_test_anon(folio) &&
|
||||
!PageAnonExclusive(vmf->page));
|
||||
/*
|
||||
* Clear the folio's cpupid information as the existing
|
||||
* information potentially belongs to a now completely
|
||||
* unrelated process.
|
||||
*/
|
||||
folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
|
||||
}
|
||||
|
||||
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
|
||||
entry = pte_mkyoung(vmf->orig_pte);
|
||||
|
@ -3245,6 +3246,7 @@ oom:
|
|||
* writeable once the page is prepared
|
||||
*
|
||||
* @vmf: structure describing the fault
|
||||
* @folio: the folio of vmf->page
|
||||
*
|
||||
* This function handles all that is needed to finish a write page fault in a
|
||||
* shared mapping due to PTE being read-only once the mapped page is prepared.
|
||||
|
@ -3256,7 +3258,7 @@ oom:
|
|||
* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
|
||||
* we acquired PTE lock.
|
||||
*/
|
||||
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
|
||||
static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
|
||||
{
|
||||
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
|
||||
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
|
||||
|
@ -3272,7 +3274,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
|
|||
pte_unmap_unlock(vmf->pte, vmf->ptl);
|
||||
return VM_FAULT_NOPAGE;
|
||||
}
|
||||
wp_page_reuse(vmf);
|
||||
wp_page_reuse(vmf, folio);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -3297,9 +3299,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
|
|||
ret = vma->vm_ops->pfn_mkwrite(vmf);
|
||||
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
|
||||
return ret;
|
||||
return finish_mkwrite_fault(vmf);
|
||||
return finish_mkwrite_fault(vmf, NULL);
|
||||
}
|
||||
wp_page_reuse(vmf);
|
||||
wp_page_reuse(vmf, NULL);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -3327,14 +3329,14 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
|
|||
folio_put(folio);
|
||||
return tmp;
|
||||
}
|
||||
tmp = finish_mkwrite_fault(vmf);
|
||||
tmp = finish_mkwrite_fault(vmf, folio);
|
||||
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
return tmp;
|
||||
}
|
||||
} else {
|
||||
wp_page_reuse(vmf);
|
||||
wp_page_reuse(vmf, folio);
|
||||
folio_lock(folio);
|
||||
}
|
||||
ret |= fault_dirty_shared_page(vmf);
|
||||
|
@ -3458,7 +3460,7 @@ reuse:
|
|||
pte_unmap_unlock(vmf->pte, vmf->ptl);
|
||||
return 0;
|
||||
}
|
||||
wp_page_reuse(vmf);
|
||||
wp_page_reuse(vmf, folio);
|
||||
return 0;
|
||||
}
|
||||
copy:
|
||||
|
@ -4866,7 +4868,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
|
|||
!node_is_toptier(nid))
|
||||
last_cpupid = (-1 & LAST_CPUPID_MASK);
|
||||
else
|
||||
last_cpupid = page_cpupid_last(&folio->page);
|
||||
last_cpupid = folio_last_cpupid(folio);
|
||||
target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
|
||||
if (target_nid == NUMA_NO_NODE) {
|
||||
folio_put(folio);
|
||||
|
|
|
@ -606,20 +606,20 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
|
|||
* Copy NUMA information to the new page, to prevent over-eager
|
||||
* future migrations of this same page.
|
||||
*/
|
||||
cpupid = page_cpupid_xchg_last(&folio->page, -1);
|
||||
cpupid = folio_xchg_last_cpupid(folio, -1);
|
||||
/*
|
||||
* For memory tiering mode, when migrate between slow and fast
|
||||
* memory node, reset cpupid, because that is used to record
|
||||
* page access time in slow memory node.
|
||||
*/
|
||||
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
|
||||
bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
|
||||
bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
|
||||
bool f_toptier = node_is_toptier(folio_nid(folio));
|
||||
bool t_toptier = node_is_toptier(folio_nid(newfolio));
|
||||
|
||||
if (f_toptier != t_toptier)
|
||||
cpupid = -1;
|
||||
}
|
||||
page_cpupid_xchg_last(&newfolio->page, cpupid);
|
||||
folio_xchg_last_cpupid(newfolio, cpupid);
|
||||
|
||||
folio_migrate_ksm(newfolio, folio);
|
||||
/*
|
||||
|
|
|
@ -94,19 +94,19 @@ void lruvec_init(struct lruvec *lruvec)
|
|||
}
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
|
||||
int page_cpupid_xchg_last(struct page *page, int cpupid)
|
||||
int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
|
||||
{
|
||||
unsigned long old_flags, flags;
|
||||
int last_cpupid;
|
||||
|
||||
old_flags = READ_ONCE(page->flags);
|
||||
old_flags = READ_ONCE(folio->flags);
|
||||
do {
|
||||
flags = old_flags;
|
||||
last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
|
||||
|
||||
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
|
||||
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
|
||||
} while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
|
||||
} while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags)));
|
||||
|
||||
return last_cpupid;
|
||||
}
|
||||
|
|
|
@ -114,7 +114,7 @@ static long change_pte_range(struct mmu_gather *tlb,
|
|||
* pages. See similar comment in change_huge_pmd.
|
||||
*/
|
||||
if (prot_numa) {
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
int nid;
|
||||
bool toptier;
|
||||
|
||||
|
@ -122,13 +122,14 @@ static long change_pte_range(struct mmu_gather *tlb,
|
|||
if (pte_protnone(oldpte))
|
||||
continue;
|
||||
|
||||
page = vm_normal_page(vma, addr, oldpte);
|
||||
if (!page || is_zone_device_page(page) || PageKsm(page))
|
||||
folio = vm_normal_folio(vma, addr, oldpte);
|
||||
if (!folio || folio_is_zone_device(folio) ||
|
||||
folio_test_ksm(folio))
|
||||
continue;
|
||||
|
||||
/* Also skip shared copy-on-write pages */
|
||||
if (is_cow_mapping(vma->vm_flags) &&
|
||||
page_count(page) != 1)
|
||||
folio_ref_count(folio) != 1)
|
||||
continue;
|
||||
|
||||
/*
|
||||
|
@ -136,14 +137,15 @@ static long change_pte_range(struct mmu_gather *tlb,
|
|||
* it cannot move them all from MIGRATE_ASYNC
|
||||
* context.
|
||||
*/
|
||||
if (page_is_file_lru(page) && PageDirty(page))
|
||||
if (folio_is_file_lru(folio) &&
|
||||
folio_test_dirty(folio))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Don't mess with PTEs if page is already on the node
|
||||
* a single-threaded process is running on.
|
||||
*/
|
||||
nid = page_to_nid(page);
|
||||
nid = folio_nid(folio);
|
||||
if (target_node == nid)
|
||||
continue;
|
||||
toptier = node_is_toptier(nid);
|
||||
|
@ -157,7 +159,7 @@ static long change_pte_range(struct mmu_gather *tlb,
|
|||
continue;
|
||||
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
|
||||
!toptier)
|
||||
xchg_page_access_time(page,
|
||||
folio_xchg_access_time(folio,
|
||||
jiffies_to_msecs(jiffies));
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
CONFIG_CPUSETS_V1=y
|
|
@ -8,6 +8,7 @@
|
|||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "../kselftest.h"
|
||||
#include "cgroup_util.h"
|
||||
|
@ -229,6 +230,79 @@ cleanup:
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Creates a nice process that consumes CPU and checks that the elapsed
|
||||
* usertime in the cgroup is close to the expected time.
|
||||
*/
|
||||
static int test_cpucg_nice(const char *root)
|
||||
{
|
||||
int ret = KSFT_FAIL;
|
||||
int status;
|
||||
long user_usec, nice_usec;
|
||||
long usage_seconds = 2;
|
||||
long expected_nice_usec = usage_seconds * USEC_PER_SEC;
|
||||
char *cpucg;
|
||||
pid_t pid;
|
||||
|
||||
cpucg = cg_name(root, "cpucg_test");
|
||||
if (!cpucg)
|
||||
goto cleanup;
|
||||
|
||||
if (cg_create(cpucg))
|
||||
goto cleanup;
|
||||
|
||||
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
|
||||
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
|
||||
if (nice_usec == -1)
|
||||
ret = KSFT_SKIP;
|
||||
if (user_usec != 0 || nice_usec != 0)
|
||||
goto cleanup;
|
||||
|
||||
/*
|
||||
* We fork here to create a new process that can be niced without
|
||||
* polluting the nice value of other selftests
|
||||
*/
|
||||
pid = fork();
|
||||
if (pid < 0) {
|
||||
goto cleanup;
|
||||
} else if (pid == 0) {
|
||||
struct cpu_hog_func_param param = {
|
||||
.nprocs = 1,
|
||||
.ts = {
|
||||
.tv_sec = usage_seconds,
|
||||
.tv_nsec = 0,
|
||||
},
|
||||
.clock_type = CPU_HOG_CLOCK_PROCESS,
|
||||
};
|
||||
char buf[64];
|
||||
snprintf(buf, sizeof(buf), "%d", getpid());
|
||||
if (cg_write(cpucg, "cgroup.procs", buf))
|
||||
goto cleanup;
|
||||
|
||||
/* Try to keep niced CPU usage as constrained to hog_cpu as possible */
|
||||
nice(1);
|
||||
hog_cpus_timed(cpucg, ¶m);
|
||||
exit(0);
|
||||
} else {
|
||||
waitpid(pid, &status, 0);
|
||||
if (!WIFEXITED(status))
|
||||
goto cleanup;
|
||||
|
||||
user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
|
||||
nice_usec = cg_read_key_long(cpucg, "cpu.stat", "nice_usec");
|
||||
if (!values_close(nice_usec, expected_nice_usec, 1))
|
||||
goto cleanup;
|
||||
|
||||
ret = KSFT_PASS;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
cg_destroy(cpucg);
|
||||
free(cpucg);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int
|
||||
run_cpucg_weight_test(
|
||||
const char *root,
|
||||
|
@ -686,6 +760,7 @@ struct cpucg_test {
|
|||
} tests[] = {
|
||||
T(test_cpucg_subtree_control),
|
||||
T(test_cpucg_stats),
|
||||
T(test_cpucg_nice),
|
||||
T(test_cpucg_weight_overprovisioned),
|
||||
T(test_cpucg_weight_underprovisioned),
|
||||
T(test_cpucg_nested_weight_overprovisioned),
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
#!/bin/bash
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# Basic test for cpuset v1 interfaces write/read
|
||||
#
|
||||
|
||||
skip_test() {
|
||||
echo "$1"
|
||||
echo "Test SKIPPED"
|
||||
exit 4 # ksft_skip
|
||||
}
|
||||
|
||||
write_test() {
|
||||
dir=$1
|
||||
interface=$2
|
||||
value=$3
|
||||
original=$(cat $dir/$interface)
|
||||
echo "testing $interface $value"
|
||||
echo $value > $dir/$interface
|
||||
new=$(cat $dir/$interface)
|
||||
[[ $value -ne $(cat $dir/$interface) ]] && {
|
||||
echo "$interface write $value failed: new:$new"
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
[[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"
|
||||
|
||||
# Find cpuset v1 mount point
|
||||
CPUSET=$(mount -t cgroup | grep cpuset | head -1 | awk '{print $3}')
|
||||
[[ -n "$CPUSET" ]] || skip_test "cpuset v1 mount point not found!"
|
||||
|
||||
#
|
||||
# Create a test cpuset, read write test
|
||||
#
|
||||
TDIR=test$$
|
||||
[[ -d $CPUSET/$TDIR ]] || mkdir $CPUSET/$TDIR
|
||||
|
||||
ITF_MATRIX=(
|
||||
#interface value expect root_only
|
||||
'cpuset.cpus 0-1 0-1 0'
|
||||
'cpuset.mem_exclusive 1 1 0'
|
||||
'cpuset.mem_exclusive 0 0 0'
|
||||
'cpuset.mem_hardwall 1 1 0'
|
||||
'cpuset.mem_hardwall 0 0 0'
|
||||
'cpuset.memory_migrate 1 1 0'
|
||||
'cpuset.memory_migrate 0 0 0'
|
||||
'cpuset.memory_spread_page 1 1 0'
|
||||
'cpuset.memory_spread_page 0 0 0'
|
||||
'cpuset.memory_spread_slab 1 1 0'
|
||||
'cpuset.memory_spread_slab 0 0 0'
|
||||
'cpuset.mems 0 0 0'
|
||||
'cpuset.sched_load_balance 1 1 0'
|
||||
'cpuset.sched_load_balance 0 0 0'
|
||||
'cpuset.sched_relax_domain_level 2 2 0'
|
||||
'cpuset.memory_pressure_enabled 1 1 1'
|
||||
'cpuset.memory_pressure_enabled 0 0 1'
|
||||
)
|
||||
|
||||
run_test()
|
||||
{
|
||||
cnt="${ITF_MATRIX[@]}"
|
||||
for i in "${ITF_MATRIX[@]}" ; do
|
||||
args=($i)
|
||||
root_only=${args[3]}
|
||||
[[ $root_only -eq 1 ]] && {
|
||||
write_test "$CPUSET" "${args[0]}" "${args[1]}" "${args[2]}"
|
||||
continue
|
||||
}
|
||||
write_test "$CPUSET/$TDIR" "${args[0]}" "${args[1]}" "${args[2]}"
|
||||
done
|
||||
}
|
||||
|
||||
run_test
|
||||
rmdir $CPUSET/$TDIR
|
||||
echo "Test PASSED"
|
||||
exit 0
|
|
@ -50,9 +50,35 @@ static int get_zswap_stored_pages(size_t *value)
|
|||
return read_int("/sys/kernel/debug/zswap/stored_pages", value);
|
||||
}
|
||||
|
||||
static int get_zswap_written_back_pages(size_t *value)
|
||||
static long get_cg_wb_count(const char *cg)
|
||||
{
|
||||
return read_int("/sys/kernel/debug/zswap/written_back_pages", value);
|
||||
return cg_read_key_long(cg, "memory.stat", "zswpwb");
|
||||
}
|
||||
|
||||
static long get_zswpout(const char *cgroup)
|
||||
{
|
||||
return cg_read_key_long(cgroup, "memory.stat", "zswpout ");
|
||||
}
|
||||
|
||||
static int allocate_and_read_bytes(const char *cgroup, void *arg)
|
||||
{
|
||||
size_t size = (size_t)arg;
|
||||
char *mem = (char *)malloc(size);
|
||||
int ret = 0;
|
||||
|
||||
if (!mem)
|
||||
return -1;
|
||||
for (int i = 0; i < size; i += 4095)
|
||||
mem[i] = 'a';
|
||||
|
||||
/* Go through the allocated memory to (z)swap in and out pages */
|
||||
for (int i = 0; i < size; i += 4095) {
|
||||
if (mem[i] != 'a')
|
||||
ret = -1;
|
||||
}
|
||||
|
||||
free(mem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int allocate_bytes(const char *cgroup, void *arg)
|
||||
|
@ -68,21 +94,33 @@ static int allocate_bytes(const char *cgroup, void *arg)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* When trying to store a memcg page in zswap, if the memcg hits its memory
|
||||
* limit in zswap, writeback should not be triggered.
|
||||
*
|
||||
* This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may
|
||||
* not zswap"). Needs to be revised when a per memcg writeback mechanism is
|
||||
* implemented.
|
||||
*/
|
||||
static int test_no_invasive_cgroup_shrink(const char *root)
|
||||
static char *setup_test_group_1M(const char *root, const char *name)
|
||||
{
|
||||
size_t written_back_before, written_back_after;
|
||||
char *group_name = cg_name(root, name);
|
||||
|
||||
if (!group_name)
|
||||
return NULL;
|
||||
if (cg_create(group_name))
|
||||
goto fail;
|
||||
if (cg_write(group_name, "memory.max", "1M")) {
|
||||
cg_destroy(group_name);
|
||||
goto fail;
|
||||
}
|
||||
return group_name;
|
||||
fail:
|
||||
free(group_name);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sanity test to check that pages are written into zswap.
|
||||
*/
|
||||
static int test_zswap_usage(const char *root)
|
||||
{
|
||||
long zswpout_before, zswpout_after;
|
||||
int ret = KSFT_FAIL;
|
||||
char *test_group;
|
||||
|
||||
/* Set up */
|
||||
test_group = cg_name(root, "no_shrink_test");
|
||||
if (!test_group)
|
||||
goto out;
|
||||
|
@ -90,26 +128,334 @@ static int test_no_invasive_cgroup_shrink(const char *root)
|
|||
goto out;
|
||||
if (cg_write(test_group, "memory.max", "1M"))
|
||||
goto out;
|
||||
if (cg_write(test_group, "memory.zswap.max", "10K"))
|
||||
|
||||
zswpout_before = get_zswpout(test_group);
|
||||
if (zswpout_before < 0) {
|
||||
ksft_print_msg("Failed to get zswpout\n");
|
||||
goto out;
|
||||
if (get_zswap_written_back_pages(&written_back_before))
|
||||
}
|
||||
|
||||
/* Allocate more than memory.max to push memory into zswap */
|
||||
if (cg_run(test_group, allocate_bytes, (void *)MB(4)))
|
||||
goto out;
|
||||
|
||||
/* Allocate 10x memory.max to push memory into zswap */
|
||||
if (cg_run(test_group, allocate_bytes, (void *)MB(10)))
|
||||
/* Verify that pages come into zswap */
|
||||
zswpout_after = get_zswpout(test_group);
|
||||
if (zswpout_after <= zswpout_before) {
|
||||
ksft_print_msg("zswpout does not increase after test program\n");
|
||||
goto out;
|
||||
}
|
||||
ret = KSFT_PASS;
|
||||
|
||||
/* Verify that no writeback happened because of the memcg allocation */
|
||||
if (get_zswap_written_back_pages(&written_back_after))
|
||||
goto out;
|
||||
if (written_back_after == written_back_before)
|
||||
ret = KSFT_PASS;
|
||||
out:
|
||||
cg_destroy(test_group);
|
||||
free(test_group);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that when memory.zswap.max = 0, no pages can go to the zswap pool for
|
||||
* the cgroup.
|
||||
*/
|
||||
static int test_swapin_nozswap(const char *root)
|
||||
{
|
||||
int ret = KSFT_FAIL;
|
||||
char *test_group;
|
||||
long swap_peak, zswpout;
|
||||
|
||||
test_group = cg_name(root, "no_zswap_test");
|
||||
if (!test_group)
|
||||
goto out;
|
||||
if (cg_create(test_group))
|
||||
goto out;
|
||||
if (cg_write(test_group, "memory.max", "8M"))
|
||||
goto out;
|
||||
if (cg_write(test_group, "memory.zswap.max", "0"))
|
||||
goto out;
|
||||
|
||||
/* Allocate and read more than memory.max to trigger swapin */
|
||||
if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
|
||||
goto out;
|
||||
|
||||
/* Verify that pages are swapped out, but no zswap happened */
|
||||
swap_peak = cg_read_long(test_group, "memory.swap.peak");
|
||||
if (swap_peak < 0) {
|
||||
ksft_print_msg("failed to get cgroup's swap_peak\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (swap_peak < MB(24)) {
|
||||
ksft_print_msg("at least 24MB of memory should be swapped out\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
zswpout = get_zswpout(test_group);
|
||||
if (zswpout < 0) {
|
||||
ksft_print_msg("failed to get zswpout\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (zswpout > 0) {
|
||||
ksft_print_msg("zswapout > 0 when memory.zswap.max = 0\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = KSFT_PASS;
|
||||
|
||||
out:
|
||||
cg_destroy(test_group);
|
||||
free(test_group);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Simple test to verify the (z)swapin code paths */
|
||||
static int test_zswapin(const char *root)
|
||||
{
|
||||
int ret = KSFT_FAIL;
|
||||
char *test_group;
|
||||
long zswpin;
|
||||
|
||||
test_group = cg_name(root, "zswapin_test");
|
||||
if (!test_group)
|
||||
goto out;
|
||||
if (cg_create(test_group))
|
||||
goto out;
|
||||
if (cg_write(test_group, "memory.max", "8M"))
|
||||
goto out;
|
||||
if (cg_write(test_group, "memory.zswap.max", "max"))
|
||||
goto out;
|
||||
|
||||
/* Allocate and read more than memory.max to trigger (z)swap in */
|
||||
if (cg_run(test_group, allocate_and_read_bytes, (void *)MB(32)))
|
||||
goto out;
|
||||
|
||||
zswpin = cg_read_key_long(test_group, "memory.stat", "zswpin ");
|
||||
if (zswpin < 0) {
|
||||
ksft_print_msg("failed to get zswpin\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (zswpin < MB(24) / PAGE_SIZE) {
|
||||
ksft_print_msg("at least 24MB should be brought back from zswap\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = KSFT_PASS;
|
||||
|
||||
out:
|
||||
cg_destroy(test_group);
|
||||
free(test_group);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Attempt writeback with the following steps:
|
||||
* 1. Allocate memory.
|
||||
* 2. Reclaim memory equal to the amount that was allocated in step 1.
|
||||
This will move it into zswap.
|
||||
* 3. Save current zswap usage.
|
||||
* 4. Move the memory allocated in step 1 back in from zswap.
|
||||
* 5. Set zswap.max to half the amount that was recorded in step 3.
|
||||
* 6. Attempt to reclaim memory equal to the amount that was allocated,
|
||||
this will either trigger writeback if it's enabled, or reclamation
|
||||
will fail if writeback is disabled as there isn't enough zswap space.
|
||||
*/
|
||||
static int attempt_writeback(const char *cgroup, void *arg)
|
||||
{
|
||||
long pagesize = sysconf(_SC_PAGESIZE);
|
||||
size_t memsize = MB(4);
|
||||
char buf[pagesize];
|
||||
long zswap_usage;
|
||||
bool wb_enabled = *(bool *) arg;
|
||||
int ret = -1;
|
||||
char *mem;
|
||||
|
||||
mem = (char *)malloc(memsize);
|
||||
if (!mem)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* Fill half of each page with increasing data, and keep other
|
||||
* half empty, this will result in data that is still compressible
|
||||
* and ends up in zswap, with material zswap usage.
|
||||
*/
|
||||
for (int i = 0; i < pagesize; i++)
|
||||
buf[i] = i < pagesize/2 ? (char) i : 0;
|
||||
|
||||
for (int i = 0; i < memsize; i += pagesize)
|
||||
memcpy(&mem[i], buf, pagesize);
|
||||
|
||||
/* Try and reclaim allocated memory */
|
||||
if (cg_write_numeric(cgroup, "memory.reclaim", memsize)) {
|
||||
ksft_print_msg("Failed to reclaim all of the requested memory\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
zswap_usage = cg_read_long(cgroup, "memory.zswap.current");
|
||||
|
||||
/* zswpin */
|
||||
for (int i = 0; i < memsize; i += pagesize) {
|
||||
if (memcmp(&mem[i], buf, pagesize)) {
|
||||
ksft_print_msg("invalid memory\n");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (cg_write_numeric(cgroup, "memory.zswap.max", zswap_usage/2))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If writeback is enabled, trying to reclaim memory now will trigger a
|
||||
* writeback as zswap.max is half of what was needed when reclaim ran the first time.
|
||||
* If writeback is disabled, memory reclaim will fail as zswap is limited and
|
||||
* it can't writeback to swap.
|
||||
*/
|
||||
ret = cg_write_numeric(cgroup, "memory.reclaim", memsize);
|
||||
if (!wb_enabled)
|
||||
ret = (ret == -EAGAIN) ? 0 : -1;
|
||||
|
||||
out:
|
||||
free(mem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int test_zswap_writeback_one(const char *cgroup, bool wb)
|
||||
{
|
||||
long zswpwb_before, zswpwb_after;
|
||||
|
||||
zswpwb_before = get_cg_wb_count(cgroup);
|
||||
if (zswpwb_before != 0) {
|
||||
ksft_print_msg("zswpwb_before = %ld instead of 0\n", zswpwb_before);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (cg_run(cgroup, attempt_writeback, (void *) &wb))
|
||||
return -1;
|
||||
|
||||
/* Verify that zswap writeback occurred only if writeback was enabled */
|
||||
zswpwb_after = get_cg_wb_count(cgroup);
|
||||
if (zswpwb_after < 0)
|
||||
return -1;
|
||||
|
||||
if (wb != !!zswpwb_after) {
|
||||
ksft_print_msg("zswpwb_after is %ld while wb is %s",
|
||||
zswpwb_after, wb ? "enabled" : "disabled");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Test to verify the zswap writeback path */
|
||||
static int test_zswap_writeback(const char *root, bool wb)
|
||||
{
|
||||
int ret = KSFT_FAIL;
|
||||
char *test_group, *test_group_child = NULL;
|
||||
|
||||
if (cg_read_strcmp(root, "memory.zswap.writeback", "1"))
|
||||
return KSFT_SKIP;
|
||||
|
||||
test_group = cg_name(root, "zswap_writeback_test");
|
||||
if (!test_group)
|
||||
goto out;
|
||||
if (cg_create(test_group))
|
||||
goto out;
|
||||
if (cg_write(test_group, "memory.zswap.writeback", wb ? "1" : "0"))
|
||||
goto out;
|
||||
|
||||
if (test_zswap_writeback_one(test_group, wb))
|
||||
goto out;
|
||||
|
||||
/* Reset memory.zswap.max to max (modified by attempt_writeback), and
|
||||
* set up child cgroup, whose memory.zswap.writeback is hardcoded to 1.
|
||||
* Thus, the parent's setting shall be what's in effect. */
|
||||
if (cg_write(test_group, "memory.zswap.max", "max"))
|
||||
goto out;
|
||||
if (cg_write(test_group, "cgroup.subtree_control", "+memory"))
|
||||
goto out;
|
||||
|
||||
test_group_child = cg_name(test_group, "zswap_writeback_test_child");
|
||||
if (!test_group_child)
|
||||
goto out;
|
||||
if (cg_create(test_group_child))
|
||||
goto out;
|
||||
if (cg_write(test_group_child, "memory.zswap.writeback", "1"))
|
||||
goto out;
|
||||
|
||||
if (test_zswap_writeback_one(test_group_child, wb))
|
||||
goto out;
|
||||
|
||||
ret = KSFT_PASS;
|
||||
|
||||
out:
|
||||
if (test_group_child) {
|
||||
cg_destroy(test_group_child);
|
||||
free(test_group_child);
|
||||
}
|
||||
cg_destroy(test_group);
|
||||
free(test_group);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int test_zswap_writeback_enabled(const char *root)
|
||||
{
|
||||
return test_zswap_writeback(root, true);
|
||||
}
|
||||
|
||||
static int test_zswap_writeback_disabled(const char *root)
|
||||
{
|
||||
return test_zswap_writeback(root, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* When trying to store a memcg page in zswap, if the memcg hits its memory
|
||||
* limit in zswap, writeback should affect only the zswapped pages of that
|
||||
* memcg.
|
||||
*/
|
||||
static int test_no_invasive_cgroup_shrink(const char *root)
|
||||
{
|
||||
int ret = KSFT_FAIL;
|
||||
size_t control_allocation_size = MB(10);
|
||||
char *control_allocation, *wb_group = NULL, *control_group = NULL;
|
||||
|
||||
wb_group = setup_test_group_1M(root, "per_memcg_wb_test1");
|
||||
if (!wb_group)
|
||||
return KSFT_FAIL;
|
||||
if (cg_write(wb_group, "memory.zswap.max", "10K"))
|
||||
goto out;
|
||||
control_group = setup_test_group_1M(root, "per_memcg_wb_test2");
|
||||
if (!control_group)
|
||||
goto out;
|
||||
|
||||
/* Push some test_group2 memory into zswap */
|
||||
if (cg_enter_current(control_group))
|
||||
goto out;
|
||||
control_allocation = malloc(control_allocation_size);
|
||||
for (int i = 0; i < control_allocation_size; i += 4095)
|
||||
control_allocation[i] = 'a';
|
||||
if (cg_read_key_long(control_group, "memory.stat", "zswapped") < 1)
|
||||
goto out;
|
||||
|
||||
/* Allocate 10x memory.max to push wb_group memory into zswap and trigger wb */
|
||||
if (cg_run(wb_group, allocate_bytes, (void *)MB(10)))
|
||||
goto out;
|
||||
|
||||
/* Verify that only zswapped memory from gwb_group has been written back */
|
||||
if (get_cg_wb_count(wb_group) > 0 && get_cg_wb_count(control_group) == 0)
|
||||
ret = KSFT_PASS;
|
||||
out:
|
||||
cg_enter_current(root);
|
||||
if (control_group) {
|
||||
cg_destroy(control_group);
|
||||
free(control_group);
|
||||
}
|
||||
cg_destroy(wb_group);
|
||||
free(wb_group);
|
||||
if (control_allocation)
|
||||
free(control_allocation);
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct no_kmem_bypass_child_args {
|
||||
size_t target_alloc_bytes;
|
||||
size_t child_allocated;
|
||||
|
@ -177,8 +523,6 @@ static int test_no_kmem_bypass(const char *root)
|
|||
trigger_allocation_size = sys_info.totalram / 20;
|
||||
|
||||
/* Set up test memcg */
|
||||
if (cg_write(root, "cgroup.subtree_control", "+memory"))
|
||||
goto out;
|
||||
test_group = cg_name(root, "kmem_bypass_test");
|
||||
if (!test_group)
|
||||
goto out;
|
||||
|
@ -235,6 +579,11 @@ struct zswap_test {
|
|||
int (*fn)(const char *root);
|
||||
const char *name;
|
||||
} tests[] = {
|
||||
T(test_zswap_usage),
|
||||
T(test_swapin_nozswap),
|
||||
T(test_zswapin),
|
||||
T(test_zswap_writeback_enabled),
|
||||
T(test_zswap_writeback_disabled),
|
||||
T(test_no_kmem_bypass),
|
||||
T(test_no_invasive_cgroup_shrink),
|
||||
};
|
||||
|
|