diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_PRE_OOM b/anolis/configs/L1-RECOMMEND/default/CONFIG_PRE_OOM
new file mode 100644
index 0000000000000000000000000000000000000000..0e3b36104118d0840614411f4c29a561c566c787
--- /dev/null
+++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_PRE_OOM
@@ -0,0 +1 @@
+CONFIG_PRE_OOM=y
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8951111c8418ed4b5da81fb663f3479c2e5a0ec8..4b2803605a87366f4804c02aba3fef91233bc9cf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -374,6 +374,9 @@ struct mem_cgroup {
 	int num_oom_skip;
 	struct mem_cgroup *next_reset;
 
+#ifdef CONFIG_PRE_OOM
+	bool pre_oom;
+#endif
 	CK_KABI_RESERVE(1)
 	CK_KABI_RESERVE(2)
 	CK_KABI_RESERVE(3)
diff --git a/include/linux/pre_oom.h b/include/linux/pre_oom.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fbd4fb3f9cd9f7f9385d3901fb5ec771d0a4fa8
--- /dev/null
+++ b/include/linux/pre_oom.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_PRE_OOM_H
+#define _LINUX_PRE_OOM_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_PRE_OOM
+
+#include <linux/jump_label.h>
+#include <linux/sched.h>
+
+DECLARE_STATIC_KEY_FALSE(pre_oom_enabled_key);
+static inline bool pre_oom_enabled(void)
+{
+	return static_branch_unlikely(&pre_oom_enabled_key);
+}
+
+
+int pre_oom_enter(void);
+void pre_oom_leave(void);
+
+#else
+
+static inline bool pre_oom_enabled(void)
+{
+	return false;
+}
+
+static inline int pre_oom_enter(void)
+{
+	return 0;
+}
+
+static inline void pre_oom_leave(void) {}
+
+#endif /* CONFIG_PRE_OOM */
+#endif /* _LINUX_PRE_OOM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3614e5f20d91cd1520c121ad70b7d7dfd40e7a7f..b637a2dc6e0e33aa6ac2f9317bce6d2ec0bfd5ea 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1090,6 +1090,9 @@ struct task_struct {
 	unsigned			in_thrashing:1;
 #endif
 	unsigned			in_nf_duplicate:1;
+#ifdef CONFIG_PRE_OOM
+	unsigned			reclaim_stall:1;
+#endif
 #ifdef CONFIG_PREEMPT_RT
 	struct netdev_xmit		net_xmit;
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index 25e9cb6de7e7933596921e7d5030132424373f69..3ac6b7b7e5f57f86c50119750b6acb487015dceb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -71,7 +71,9 @@
 #include <linux/rethook.h>
 #include <linux/sysfs.h>
 #include <linux/user_events.h>
 #include <linux/uaccess.h>
-
+#ifdef CONFIG_PRE_OOM
+#include <linux/pre_oom.h>
+#endif
 #include <asm/unistd.h>
 #include <asm/mmu_context.h>
@@ -1006,7 +1008,14 @@ void __noreturn do_exit(long code)
 		put_page(tsk->task_frag.page);
 
 	exit_task_stack_account(tsk);
-
+#ifdef CONFIG_PRE_OOM
+	/*
+	 * Killed task has been stalled in reclaim path, release the semaphore
+	 * here.
+	 */
+	if (unlikely(tsk->reclaim_stall))
+		pre_oom_leave();
+#endif
 	check_stack_usage();
 	preempt_disable();
 	if (tsk->nr_dirtied)
diff --git a/mm/Kconfig b/mm/Kconfig
index a053982ff4b2f08374b1db198443e044eee81e92..562e6aeb7b4efdec68903675e33bae993d5acdb1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1479,6 +1479,13 @@ config LAZY_MMU_MODE_KUNIT_TEST
 
 	  If unsure, say N.
 
+config PRE_OOM
+	bool "Enable pre oom control"
+	depends on MEMCG
+	help
+	  This feature is used to ensure that higher priority tasks would not
+	  enter the direct reclaim path when applying for memory allocation.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 8ad2ab08244ebe5699914a7df4235d11fd2f62bd..581a396bb7b3153522a5e66ba588fce345d8530e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -150,3 +150,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
 obj-$(CONFIG_EXECMEM) += execmem.o
 obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
 obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o
+obj-$(CONFIG_PRE_OOM) += pre_oom.o
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index fb072008ea4eb4c6683dac753f50d710abf0c94f..97a9a9474e5741b9b7b5fb92470f105b05d4e97f 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -2324,6 +2324,13 @@ struct cftype mem_cgroup_legacy_files[] = {
 		.name = "oom.group",
 		.seq_show = memory_oom_group_show,
 		.write = memory_oom_group_write,
 	},
+#ifdef CONFIG_PRE_OOM
+	{
+		.name = "pre_oom",
+		.write_u64 = memcg_pre_oom_write,
+		.read_u64 = memcg_pre_oom_read,
+	},
+#endif
 	{ },	/* terminate */
 };
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index f92f81108d5ed20a5b8d04a12e90143b6a675c95..eaed59fbbbcbb3d6104a5fa2c7158a240ef7ec14 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -26,6 +26,11 @@ void drain_all_stock(struct mem_cgroup *root_memcg);
 unsigned long memcg_events(struct mem_cgroup *memcg, int event);
 int memory_stat_show(struct seq_file *m, void *v);
 
+#ifdef CONFIG_PRE_OOM
+u64 memcg_pre_oom_read(struct cgroup_subsys_state *css, struct cftype *cft);
+int memcg_pre_oom_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 val);
+#endif /* CONFIG_PRE_OOM */
 struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg,
 						   unsigned int n);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eb6df08089ae9c87456d11a05008dc90d469fb5d..e73c2c43c68248a10bc21fcc86ba63340d422ebd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5112,6 +5112,28 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }
 
+#ifdef CONFIG_PRE_OOM
+u64 memcg_pre_oom_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return READ_ONCE(memcg->pre_oom);
+}
+
+int memcg_pre_oom_write(struct cgroup_subsys_state *css,
+			struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (val)
+		WRITE_ONCE(memcg->pre_oom, 1);
+	else
+		WRITE_ONCE(memcg->pre_oom, 0);
+
+	return 0;
+}
+#endif /* CONFIG_PRE_OOM */
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -5238,6 +5260,13 @@ static struct cftype memory_files[] = {
 		.name = "reclaim",
 		.flags = CFTYPE_NS_DELEGATABLE,
 		.write = memory_reclaim,
 	},
+#ifdef CONFIG_PRE_OOM
+	{
+		.name = "pre_oom",
+		.write_u64 = memcg_pre_oom_write,
+		.read_u64 = memcg_pre_oom_read,
+	},
+#endif
 	{ }	/* terminate */
 };
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae5da1fafb6f3d2c1e2de0cdf8de98e417074ac8..3d06392de9cc8b13e0322eb3e38237400b2e78cc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,7 @@
 #include <linux/khugepaged.h>
 #include <linux/delayacct.h>
 #include <linux/cacheinfo.h>
+#include <linux/pre_oom.h>
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
@@ -4398,6 +4399,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	pre_oom_enter();
 	cpuset_memory_pressure_bump();
 	memcg_lat_stat_start(&start);
 	fs_reclaim_acquire(gfp_mask);
@@ -4409,6 +4411,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(gfp_mask);
 	memcg_lat_stat_end(MEM_LAT_GLOBAL_DIRECT_RECLAIM, start);
+	pre_oom_leave();
 
 	cond_resched();
 
@@ -4709,6 +4712,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int reserve_flags;
 	bool compact_first = false;
 	bool can_retry_reserves = true;
+	bool can_pre_oom = false;
 
 	if (unlikely(nofail)) {
 		/*
@@ -4724,6 +4728,28 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
 	}
 
+#ifdef CONFIG_PRE_OOM
+	/*
+	 * If Pre-OOM is enabled, the cgroup of QoS sensitive should avoid
+	 * direct reclaim and trigger OOM as soon as possible. Thus gfp_mask
+	 * should be reset here.
+	 */
+	if (pre_oom_enabled()) {
+		struct mem_cgroup *memcg;
+
+		memcg = get_mem_cgroup_from_mm(current->mm);
+		if (memcg) {
+			if (memcg->pre_oom)
+				gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+			css_put(&memcg->css);
+		}
+
+		can_pre_oom = !(gfp_mask & __GFP_DIRECT_RECLAIM);
+	}
+
+#endif
+
+
 restart:
 	compaction_retries = 0;
 	no_progress_loops = 0;
@@ -4820,6 +4846,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (!can_direct_reclaim)
 		goto nopage;
 
+	/* Caller is going to oom, skip direct compact and reclaim */
+	if (can_pre_oom)
+		goto oom;
+
 	/* Avoid recursion of direct reclaim */
 	if (current->flags & PF_MEMALLOC)
 		goto nopage;
@@ -4916,6 +4946,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	    check_retry_zonelist(zonelist_iter_cookie))
 		goto restart;
 
+oom:
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
diff --git a/mm/pre_oom.c b/mm/pre_oom.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa579bd2ea46a84bb4b026ce72b032fbd97864e6
--- /dev/null
+++ b/mm/pre_oom.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/pre_oom.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+
+DEFINE_STATIC_KEY_FALSE(pre_oom_enabled_key);
+
+/*
+ * From 0 .. 3, which means the kernel can support up to
+ * num_online_cpus() / (oom_level + 1) tasks to reclaim memory.
+ */
+static int oom_level;
+static struct semaphore *sem;
+
+int pre_oom_enter(void)
+{
+	int result;
+
+	if (!pre_oom_enabled())
+		return 0;
+
+	result = down_killable(sem);
+	if (!result)
+		current->reclaim_stall = 1;
+
+	return result;
+}
+
+void pre_oom_leave(void)
+{
+	if (current->reclaim_stall) {
+		current->reclaim_stall = 0;
+		up(sem);
+	}
+}
+
+static int adjust_oom_level(int level)
+{
+	unsigned long flags;
+	int count = num_online_cpus() / (level + 1);
+	int result = 0;
+
+	raw_spin_lock_irqsave(&sem->lock, flags);
+
+	/* There are no other tasks reclaiming memory */
+	if (sem->count == (num_online_cpus() / (oom_level + 1))) {
+		sem->count = count;
+		oom_level = level;
+	} else {
+		result = -EPERM;
+	}
+
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+	return result;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t pre_oom_enabled_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n",
+			  !!static_branch_unlikely(&pre_oom_enabled_key));
+}
+
+static ssize_t pre_oom_enabled_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(mutex);
+	ssize_t ret = count;
+
+	mutex_lock(&mutex);
+
+	if (!strncmp(buf, "1", 1))
+		static_branch_enable(&pre_oom_enabled_key);
+	else if (!strncmp(buf, "0", 1))
+		static_branch_disable(&pre_oom_enabled_key);
+	else
+		ret = -EINVAL;
+
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static ssize_t pre_oom_level_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", oom_level);
+}
+
+static ssize_t pre_oom_level_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(mutex);
+	unsigned long level;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 10, &level);
+	if (ret)
+		return ret;
+
+	if (level > 3)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	ret = adjust_oom_level(level);
+	mutex_unlock(&mutex);
+
+	/* A successful sysfs store must return the number of bytes consumed. */
+	return ret ? ret : count;
+}
+
+static struct kobj_attribute pre_oom_enabled_attr =
+	__ATTR(enabled, 0644, pre_oom_enabled_show,
+	       pre_oom_enabled_store);
+
+static struct kobj_attribute pre_oom_level_attr =
+	__ATTR(level, 0644, pre_oom_level_show,
+	       pre_oom_level_store);
+
+static struct attribute *pre_oom_attrs[] = {
+	&pre_oom_enabled_attr.attr,
+	&pre_oom_level_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group pre_oom_attr_group = {
+	.attrs = pre_oom_attrs,
+	.name = "pre_oom",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init pre_oom_init(void)
+{
+	/* Allocate before sysfs exposure so enabling can never see sem == NULL. */
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return -ENOMEM;
+
+	sema_init(sem, num_online_cpus());
+
+#ifdef CONFIG_SYSFS
+	{
+		int err = sysfs_create_group(mm_kobj, &pre_oom_attr_group);
+
+		if (err) {
+			pr_err("pre_oom: register sysfs failed\n");
+			kfree(sem);
+			sem = NULL;
+			return err;
+		}
+	}
+#endif
+
+	return 0;
+}
+subsys_initcall(pre_oom_init);