From 34d6837d6c0befec9fc506588340cd727dbd7b9a Mon Sep 17 00:00:00 2001 From: Kaihao Bai Date: Thu, 7 May 2026 11:41:35 +0800 Subject: [PATCH] anolis: mm: support pre oom ANBZ: #9079 Provide a universal and reliable rapid OOM resolution solution to assist businesses in enhancing memory deployment density and improving the stability of online business performance during high watermark operation. The fast oom (pre_oom) feature monitors memory pressure and triggers OOM killer earlier than the standard kernel OOM path, preventing system instability under extreme memory pressure. Key features: - New sysctl interface for global fast oom control (vm.pre_oom_level) - Per-memcg fast oom control via cgroup interface - Avoids unnecessary direct reclaim sleep for non-reclaimable allocations - Restricts fast oom to only affect direct memory reclaim path (not memcg-level reclaim or kswapd) to avoid potential deadlocks Signed-off-by: Kaihao Bai --- .../L1-RECOMMEND/default/CONFIG_PRE_OOM | 1 + include/linux/memcontrol.h | 3 + include/linux/pre_oom.h | 36 ++++ include/linux/sched.h | 3 + kernel/exit.c | 13 +- mm/Kconfig | 7 + mm/Makefile | 1 + mm/memcontrol-v1.c | 7 + mm/memcontrol-v1.h | 5 + mm/memcontrol.c | 28 +++ mm/page_alloc.c | 31 ++++ mm/pre_oom.c | 161 ++++++++++++++++++ 12 files changed, 294 insertions(+), 2 deletions(-) create mode 100644 anolis/configs/L1-RECOMMEND/default/CONFIG_PRE_OOM create mode 100644 include/linux/pre_oom.h create mode 100644 mm/pre_oom.c diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_PRE_OOM b/anolis/configs/L1-RECOMMEND/default/CONFIG_PRE_OOM new file mode 100644 index 000000000000..0e3b36104118 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_PRE_OOM @@ -0,0 +1 @@ +CONFIG_PRE_OOM=y diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a998b8743cb9..61f19fcd375b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -391,6 +391,9 @@ struct mem_cgroup { int num_oom_skip; struct 
mem_cgroup *next_reset; +#ifdef CONFIG_PRE_OOM + bool pre_oom; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) diff --git a/include/linux/pre_oom.h b/include/linux/pre_oom.h new file mode 100644 index 000000000000..1fbd4fb3f9cd --- /dev/null +++ b/include/linux/pre_oom.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_PRE_OOM_H +#define _LINUX_PRE_OOM_H + +#include + +#ifdef CONFIG_PRE_OOM + +#include +#include + +DECLARE_STATIC_KEY_FALSE(pre_oom_enabled_key); +static inline bool pre_oom_enabled(void) +{ + return static_branch_unlikely(&pre_oom_enabled_key); +} + + +int pre_oom_enter(void); +void pre_oom_leave(void); + +#else + +static inline bool pre_oom_enabled(void) +{ + return false; +} + +static inline int pre_oom_enter(void) +{ + return 0; +} + +static inline void pre_oom_leave(void) {} + +#endif /* CONFIG_PRE_OOM */ +#endif /* _LINUX_PRE_OOM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3614e5f20d91..b637a2dc6e0e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1090,6 +1090,9 @@ struct task_struct { unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; +#ifdef CONFIG_PRE_OOM + unsigned reclaim_stall:1; +#endif #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif diff --git a/kernel/exit.c b/kernel/exit.c index 25e9cb6de7e7..3ac6b7b7e5f5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -71,7 +71,9 @@ #include #include #include - +#ifdef CONFIG_PRE_OOM +#include +#endif #include #include @@ -1006,7 +1008,14 @@ void __noreturn do_exit(long code) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); - +#ifdef CONFIG_PRE_OOM + /* + * Killed task has been stalled in reclaim path, release the semaphore + * here. 
+ */ + if (unlikely(tsk->reclaim_stall)) + pre_oom_leave(); +#endif check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) diff --git a/mm/Kconfig b/mm/Kconfig index ef0c2b2a9487..56b90bff84a2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1482,6 +1482,13 @@ config LAZY_MMU_MODE_KUNIT_TEST If unsure, say N. +config PRE_OOM + bool "Enable pre oom control" + depends on MEMCG + help + This feature is used to ensure that higher priority tasks would not enter the direct + reclaim path when applying for memory allocation. + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 8ad2ab08244e..581a396bb7b3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -150,3 +150,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o +obj-$(CONFIG_PRE_OOM) += pre_oom.o diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 0f44459d3495..79b87c1450f9 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -2371,6 +2371,13 @@ struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_lru_gen_show, .write = memcg_lru_gen_write, }, +#endif +#ifdef CONFIG_PRE_OOM + { + .name = "pre_oom", + .write_u64 = memcg_pre_oom_write, + .read_u64 = memcg_pre_oom_read, + }, #endif { }, /* terminate */ }; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index a211f7009c17..b0f363cc0d35 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -26,6 +26,11 @@ void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); int memory_stat_show(struct seq_file *m, void *v); +#ifdef CONFIG_PRE_OOM +u64 memcg_pre_oom_read(struct cgroup_subsys_state *css, struct cftype *cft); +int memcg_pre_oom_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val); +#endif /* CONFIG_PRE_OOM */ struct mem_cgroup *mem_cgroup_private_id_get_online(struct mem_cgroup *memcg, 
unsigned int n); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9bcd2b1334c7..930257220428 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5385,6 +5385,27 @@ ssize_t memcg_lru_gen_write(struct kernfs_open_file *of, return lru_gen_memcg_write(of, buf, nbytes, off); } #endif +#ifdef CONFIG_PRE_OOM +u64 memcg_pre_oom_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return READ_ONCE(memcg->pre_oom); +} + +int memcg_pre_oom_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val) + WRITE_ONCE(memcg->pre_oom, 1); + else + WRITE_ONCE(memcg->pre_oom, 0); + + return 0; +} +#endif /* CONFIG_PRE_OOM */ static struct cftype memory_files[] = { { @@ -5544,6 +5565,13 @@ static struct cftype memory_files[] = { .seq_show = memcg_lru_gen_show, .write = memcg_lru_gen_write, }, +#endif +#ifdef CONFIG_PRE_OOM + { + .name = "pre_oom", + .write_u64 = memcg_pre_oom_write, + .read_u64 = memcg_pre_oom_read, + }, #endif { } /* terminate */ }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ae5da1fafb6f..3d06392de9cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include "internal.h" #include "shuffle.h" @@ -4398,6 +4399,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); /* We now go into synchronous reclaim */ + pre_oom_enter(); cpuset_memory_pressure_bump(); memcg_lat_stat_start(&start); fs_reclaim_acquire(gfp_mask); @@ -4409,6 +4411,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); memcg_lat_stat_end(MEM_LAT_GLOBAL_DIRECT_RECLAIM, start); + pre_oom_leave(); cond_resched(); @@ -4709,6 +4712,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int reserve_flags; bool compact_first = false; bool can_retry_reserves = true; + bool 
can_pre_oom = false;
 
 	if (unlikely(nofail)) {
 		/*
@@ -4724,6 +4728,28 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		WARN_ON_ONCE(current->flags & PF_MEMALLOC);
 	}
 
+#ifdef CONFIG_PRE_OOM
+	/*
+	 * If Pre-OOM is enabled, a task in a QoS-sensitive (pre_oom) cgroup
+	 * should avoid direct reclaim and trigger the OOM killer as soon as
+	 * possible, so strip __GFP_DIRECT_RECLAIM from its allocations.
+	 *
+	 * Only the pre_oom cgroup case may jump straight to OOM; allocations
+	 * that merely lack __GFP_DIRECT_RECLAIM to begin with (e.g.
+	 * GFP_ATOMIC) must keep taking the normal nopage path.
+	 */
+	if (pre_oom_enabled()) {
+		struct mem_cgroup *memcg;
+
+		memcg = get_mem_cgroup_from_mm(current->mm);
+		if (memcg) {
+			if (READ_ONCE(memcg->pre_oom)) {
+				gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+				can_pre_oom = true;
+			}
+			css_put(&memcg->css);
+		}
+	}
+#endif
+
 restart:
 	compaction_retries = 0;
 	no_progress_loops = 0;
@@ -4820,6 +4846,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+	/*
+	 * Caller opted in to pre-OOM: skip compaction and direct reclaim and
+	 * go straight to the OOM killer.  This must be checked before the
+	 * !can_direct_reclaim bail-out, because __GFP_DIRECT_RECLAIM was
+	 * cleared above and the nopage path would otherwise win.
+	 */
+	if (can_pre_oom)
+		goto oom;
+
 	if (!can_direct_reclaim)
 		goto nopage;
 
 	/* Avoid recursion of direct reclaim */
 	if (current->flags & PF_MEMALLOC)
 		goto nopage;
@@ -4916,6 +4946,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	    check_retry_zonelist(zonelist_iter_cookie))
 		goto restart;
 
+oom:
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
diff --git a/mm/pre_oom.c b/mm/pre_oom.c
new file mode 100644
index 000000000000..aa579bd2ea46
--- /dev/null
+++ b/mm/pre_oom.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/* NOTE(review): include targets were lost in transit; reconstructed from use sites -- confirm against original patch. */
+#include <linux/pre_oom.h>
+#include <linux/semaphore.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+
+DEFINE_STATIC_KEY_FALSE(pre_oom_enabled_key);
+
+/*
+ * From 0 .. 3, which means the kernel can support up to
+ * num_online_cpus / (oom_level + 1) tasks to reclaim memory.
+ */
+static int oom_level;
+static struct semaphore *sem;
+
+/*
+ * Throttle entry into direct reclaim.  Returns 0 once a reclaim slot is
+ * acquired, or the down_killable() error if the caller was killed while
+ * waiting.
+ */
+int pre_oom_enter(void)
+{
+	int result;
+
+	if (!pre_oom_enabled())
+		return 0;
+
+	result = down_killable(sem);
+	if (!result)
+		current->reclaim_stall = 1;
+
+	return result;
+}
+
+/* Release the reclaim slot taken in pre_oom_enter(), if any. */
+void pre_oom_leave(void)
+{
+	if (current->reclaim_stall) {
+		current->reclaim_stall = 0;
+		up(sem);
+	}
+}
+
+static int adjust_oom_level(int level)
+{
+	unsigned long flags;
+	int count = num_online_cpus() / (level + 1);
+	int result = 0;
+
+	raw_spin_lock_irqsave(&sem->lock, flags);
+
+	/*
+	 * Only retune while no task holds the semaphore, i.e. the count is
+	 * still at its full value for the current level.
+	 * NOTE(review): this assumes num_online_cpus() is unchanged since
+	 * the semaphore was sized; CPU hotplug would break the comparison --
+	 * confirm whether hotplug needs handling here.
+	 */
+	if (sem->count == (num_online_cpus() / (oom_level + 1))) {
+		sem->count = count;
+		oom_level = level;
+	} else {
+		result = -EPERM;
+	}
+
+	raw_spin_unlock_irqrestore(&sem->lock, flags);
+
+	return result;
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t pre_oom_enabled_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	/* sysfs_emit() is the canonical, bounds-checked sysfs formatter. */
+	return sysfs_emit(buf, "%d\n", !!static_branch_unlikely(&pre_oom_enabled_key));
+}
+
+static ssize_t pre_oom_enabled_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	bool enable;
+
+	/* Accept the usual boolean spellings (0/1/y/n/on/off). */
+	if (kstrtobool(buf, &enable))
+		return -EINVAL;
+
+	/*
+	 * static_branch_enable/disable() already serialize on the
+	 * jump-label mutex, so no extra local lock is needed.
+	 */
+	if (enable)
+		static_branch_enable(&pre_oom_enabled_key);
+	else
+		static_branch_disable(&pre_oom_enabled_key);
+
+	return count;
+}
+
+static ssize_t pre_oom_level_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%d\n", oom_level);
+}
+
+static ssize_t pre_oom_level_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(mutex);
+	unsigned long level;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 10, &level);
+	if (ret)
+		return ret;
+
+	/* Valid levels are 0..3; reject before taking the lock. */
+	if (level > 3)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	ret = adjust_oom_level(level);
+	mutex_unlock(&mutex);
+	
return ret ? ret : count;	/* a successful store must report consumed bytes */
+}
+
+static struct kobj_attribute pre_oom_enabled_attr =
+	__ATTR(enabled, 0644, pre_oom_enabled_show,
+	       pre_oom_enabled_store);
+
+static struct kobj_attribute pre_oom_level_attr =
+	__ATTR(level, 0644, pre_oom_level_show,
+	       pre_oom_level_store);
+
+static struct attribute *pre_oom_attrs[] = {
+	&pre_oom_enabled_attr.attr,
+	&pre_oom_level_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group pre_oom_attr_group = {
+	.attrs = pre_oom_attrs,
+	.name = "pre_oom",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init pre_oom_init(void)
+{
+	/*
+	 * Allocate and size the reclaim-throttle semaphore before the sysfs
+	 * knobs exist, so "enabled" can never be flipped on while sem is
+	 * still NULL.  Also keeps sysfs_remove_group() out of the
+	 * !CONFIG_SYSFS build, which would not compile.
+	 */
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return -ENOMEM;
+	sema_init(sem, num_online_cpus());
+
+#ifdef CONFIG_SYSFS
+	{
+		int err = sysfs_create_group(mm_kobj, &pre_oom_attr_group);
+
+		if (err) {
+			pr_err("pre_oom: register sysfs failed\n");
+			kfree(sem);
+			sem = NULL;
+			return err;
+		}
+	}
+#endif
+	return 0;
+}
+subsys_initcall(pre_oom_init);
-- 
Gitee