From 7c9293472a7177821446d4c65b8d8e049fc14e8a Mon Sep 17 00:00:00 2001
From: zhongjiang-ali
Date: Fri, 8 Apr 2022 00:24:13 +0800
Subject: [PATCH 1/6] anolis: mm: introduce ability to reserve page cache on
 system wide

ANBZ: #12338
ANBZ: #849

The kernel generally prefers to reclaim page cache over anonymous
pages, and reclaims only page cache when no swap is configured.
However, in some extreme scenarios where the application allocates a
large amount of anonymous memory, the page cache can be almost
completely exhausted while the OOM killer barely fires. The system can
then suffer from heavy IO, and application performance can be
significantly affected; the application may even become unresponsive,
since the page cache, including the application's program text, is
thrashing. In such a scenario, some users do want an OOM kill rather
than a half-dead system.

This provides users with the ability to reserve page cache system
wide. With an appropriate amount of page cache reserved, the OOM
killer can be triggered in time, and some key processes can keep
making progress (in cooperation with oom_score_adj, for example).

Enable the feature with:
echo XXX > /proc/sys/vm/min_cache_kbytes

Disable the feature with:
echo 0 > /proc/sys/vm/min_cache_kbytes
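
For example, to reserve roughly 512M of page cache and shield a key
daemon from the resulting OOM kills (the value and the process name
here are illustrative only):

echo 524288 > /proc/sys/vm/min_cache_kbytes
echo -1000 > /proc/$(pidof my-critical-daemon)/oom_score_adj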

Signed-off-by: zhongjiang-ali
Acked-by: Gang Deng
Suggested-by: yinbinbin
Reviewed-by: Xu Yu
Signed-off-by: Weilin Tong
Reviewed-by: Baolin Wang
Link: https://gitee.com/anolis/cloud-kernel/pulls/4219
Signed-off-by: Kaihao Bai
---
 include/linux/mmzone.h |  1 +
 include/linux/swap.h   |  1 +
 mm/page_alloc.c        | 53 ++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c            | 21 +++++++++++++++++
 4 files changed, 76 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 03c5cd62dcb8..504c202cc455 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1552,6 +1552,7 @@ typedef struct pglist_data {
	 */
	unsigned long		totalreserve_pages;
+	unsigned long		min_cache_pages;
 
 #ifdef CONFIG_NUMA
	/*
	 * node reclaim becomes active if more unmapped pages exist.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 50663eec7b18..59f37e86872d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -311,6 +311,7 @@ void workingset_activation(struct folio *folio);
 
 /* linux/mm/page_alloc.c */
 extern unsigned long totalreserve_pages;
+extern unsigned long sysctl_min_cache_kbytes;
 
 /* Definition of global_zone_page_state not available yet */
 #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae5da1fafb6f..16bba674706f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6557,6 +6557,51 @@ static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table,
	return 0;
 }
 
+static void setup_min_cache_kbytes(void)
+{
+	pg_data_t *pgdat;
+	struct zone *zone;
+	unsigned long lowmem_pages = 0;
+	unsigned long min_cache_pages = sysctl_min_cache_kbytes >> (PAGE_SHIFT - 10);
+
+	for_each_online_pgdat(pgdat)
+		pgdat->min_cache_pages = 0;
+
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			lowmem_pages += zone_managed_pages(zone);
+	}
+
+	for_each_zone(zone) {
+		u64 tmp;
+
+		/*
+		 * Make sure that each lowmem zone reserves an amount of file
+		 * pages to avoid thrashing. Highmem zones are allowed to be
+		 * eaten up as soon as possible.
+		 */
+		if (!is_highmem(zone)) {
+			tmp = zone_managed_pages(zone) * min_cache_pages;
+			do_div(tmp, lowmem_pages);
+			zone->zone_pgdat->min_cache_pages += tmp;
+		}
+	}
+}
+
+static int sysctl_min_cache_kbytes_sysctl_handler(const struct ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	setup_min_cache_kbytes();
+
+	return 0;
+}
+
 #ifdef CONFIG_NUMA
 static void setup_min_unmapped_ratio(void)
 {
@@ -6735,6 +6780,14 @@ static const struct ctl_table page_alloc_sysctl_table[] = {
		.mode		= 0644,
		.proc_handler	= lowmem_reserve_ratio_sysctl_handler,
	},
+	{
+		.procname	= "min_cache_kbytes",
+		.data		= &sysctl_min_cache_kbytes,
+		.maxlen		= sizeof(sysctl_min_cache_kbytes),
+		.mode		= 0644,
+		.proc_handler	= sysctl_min_cache_kbytes_sysctl_handler,
+		.extra1		= SYSCTL_LONG_ZERO,
+	},
 #ifdef CONFIG_NUMA
	{
		.procname	= "numa_zonelist_order",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2f7330b43139..ff10ee14b5eb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -149,6 +149,9 @@ struct scan_control {
	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;
 
+	/* File pages on the current node are not allowed to be reclaimed */
+	unsigned int file_is_reserved:1;
+
	/* Allocation order */
	s8 order;
 
@@ -197,6 +200,8 @@ struct scan_control {
  * From 0 .. MAX_SWAPPINESS. Higher means more swappy.
  */
 int vm_swappiness = 60;
+/* The minimum amount of page cache to be reserved system wide */
+unsigned long sysctl_min_cache_kbytes;
 
 #ifdef CONFIG_MEMCG
 
@@ -2367,7 +2372,9 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
+		unsigned long total_min_wmark = 0;
		unsigned long free, anon;
+		unsigned long min_cache_kbytes;
		int z;
		struct zone *zone;
 
@@ -2377,6 +2384,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
 
		for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
			total_high_wmark += high_wmark_pages(zone);
+			total_min_wmark += min_wmark_pages(zone);
		}
 
		/*
@@ -2390,6 +2398,17 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
+
+		/*
+		 * Reserve a specified amount of page cache in case of thrashing.
+		 * The OOM killer is preferred when the system page cache is below the
+		 * given watermark.
+		 */
+		min_cache_kbytes = READ_ONCE(sysctl_min_cache_kbytes);
+		if (min_cache_kbytes) {
+			sc->file_is_reserved = (sc->may_deactivate & DEACTIVATE_FILE) &&
+				file <= min(total_min_wmark, pgdat->min_cache_pages);
+		}
	}
 }
 
@@ -2605,6 +2624,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
+			else if (sc->file_is_reserved && file)
+				scan = 0;
			break;
		default:
			/* Look ma, no brain */
-- 
Gitee

From 83be902bbe5a6ff87648dce52fa2b5e554a1bf75 Mon Sep 17 00:00:00 2001
From: Xu Yu
Date: Sat, 22 Jul 2023 12:25:08 +0800
Subject: [PATCH 2/6] anolis: mm: make min_cache_kbytes behave explicitly

ANBZ: #12338
ANBZ: #6025

The page cache reserve feature, i.e., min_cache_kbytes, does not take
effect in some scenarios. This is because min_cache_kbytes is subject
to sc->may_deactivate and min_wmark.

Make min_cache_kbytes behave explicitly, i.e., eliminate the
additional restrictions imposed by sc->may_deactivate and min_wmark.
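
With min_cache_kbytes set to, e.g., 204800 (200M, an illustrative
value), reclaim on a node now backs off from file pages as soon as the
node's file pages fall below its share of the 200M reservation, no
matter whether the file LRU was considered deactivatable and
regardless of the min watermark.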

Fixes: 1f86172dbe16 ("anolis: mm: introduce ability to reserve page cache on system wide")
Signed-off-by: Xu Yu
Reviewed-by: zhongjiang-ali
Link: https://gitee.com/anolis/cloud-kernel/pulls/1968
Signed-off-by: Weilin Tong
Reviewed-by: Baolin Wang
Link: https://gitee.com/anolis/cloud-kernel/pulls/4219
Signed-off-by: Kaihao Bai
---
 mm/vmscan.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index ff10ee14b5eb..336429255016 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2372,7 +2372,6 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
-		unsigned long total_min_wmark = 0;
		unsigned long free, anon;
		unsigned long min_cache_kbytes;
		int z;
@@ -2383,7 +2383,6 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
 
		for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) {
			total_high_wmark += high_wmark_pages(zone);
-			total_min_wmark += min_wmark_pages(zone);
		}
 
		/*
@@ -2405,10 +2403,8 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
		 * given watermark.
		 */
		min_cache_kbytes = READ_ONCE(sysctl_min_cache_kbytes);
-		if (min_cache_kbytes) {
-			sc->file_is_reserved = (sc->may_deactivate & DEACTIVATE_FILE) &&
-				file <= min(total_min_wmark, pgdat->min_cache_pages);
-		}
+		if (min_cache_kbytes)
+			sc->file_is_reserved = file <= pgdat->min_cache_pages;
	}
 }
 
-- 
Gitee

From 4b2dd26954f2e0c906434d3a1503f0bdaa07ea7b Mon Sep 17 00:00:00 2001
From: Xu Yu
Date: Sat, 22 Jul 2023 15:04:38 +0800
Subject: [PATCH 3/6] anolis: mm: limit min_cache_kbytes

ANBZ: #12338
ANBZ: #6025

This limits min_cache_kbytes to half of total system memory at most.

Signed-off-by: Xu Yu
Reviewed-by: zhongjiang-ali
Link: https://gitee.com/anolis/cloud-kernel/pulls/1968
Signed-off-by: Weilin Tong
Reviewed-by: Baolin Wang
Link: https://gitee.com/anolis/cloud-kernel/pulls/4219
Signed-off-by: Kaihao Bai
---
 mm/page_alloc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 16bba674706f..f6c30b00cd87 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6592,12 +6592,22 @@ static int sysctl_min_cache_kbytes_sysctl_handler(const struct ctl_table *table, int write,
		void __user *buffer, size_t *length, loff_t *ppos)
 {
	int rc;
+	unsigned long min_cache_pages;
+	unsigned long old_min_cache_kbytes = sysctl_min_cache_kbytes;
 
	rc = proc_doulongvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;
 
-	setup_min_cache_kbytes();
+	if (write) {
+		min_cache_pages = sysctl_min_cache_kbytes >> (PAGE_SHIFT - 10);
+		if (min_cache_pages > totalram_pages() / 2) {
+			sysctl_min_cache_kbytes = old_min_cache_kbytes;
+			return -EINVAL;
+		}
+
+		setup_min_cache_kbytes();
+	}
 
	return 0;
 }
-- 
Gitee

From 095d6f5b667894c66592876d27774044fa1b87c0 Mon Sep 17 00:00:00 2001
From: Kaihao Bai
Date: Mon, 11 Dec 2023 17:33:15 +0800
Subject: [PATCH 4/6] anolis: mm: fix the judgement of min_pagecache_bytes

ANBZ: #12338
ANBZ: #7710

The min_pagecache check is only applied in the SCAN_FILE/SCAN_ANON
cases of get_scan_count(), so the remaining scan_balance cases
(SCAN_EQUAL, SCAN_FRACT) will still reclaim file pages. Move the check
out of the switch statement so that it covers all cases.

Signed-off-by: Kaihao Bai
Reviewed-by: Xu Yu
Link: https://gitee.com/anolis/cloud-kernel/pulls/2499
Signed-off-by: Weilin Tong
Reviewed-by: Baolin Wang
Link: https://gitee.com/anolis/cloud-kernel/pulls/4219
---
 mm/vmscan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 336429255016..2538fff369d2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2620,14 +2620,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
-			else if (sc->file_is_reserved && file)
-				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}
 
+		if (sc->file_is_reserved && file)
+			scan = 0;
+
		nr[lru] = scan;
	}
 }
-- 
Gitee

From fd6d2a224097ecd6c13a3c9365f309fe6e2b85f4 Mon Sep 17 00:00:00 2001
From: Yuanhe Shu
Date: Tue, 29 Oct 2024 15:00:11 +0800
Subject: [PATCH 5/6] anolis: mm: set default value for min_cache_kbytes

ANBZ: #12338
ANBZ: #11569

Set a default value for min_cache_kbytes to resolve the issue of OOM
not being triggered when the sysctl is left unconfigured.

min_cache_kbytes is set to:
 150M, when total memory is ( 0,   4G]
 300M, when total memory is ( 4G,  8G]
 400M, when total memory is ( 8G, 16G]
 500M, when total memory is (16G, 128G]
1024M, when total memory is above 128G
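
For example, a host with 32G of total memory gets the 500M default,
i.e., /proc/sys/vm/min_cache_kbytes reads 512000 (500 * 1024).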

Signed-off-by: Yuanhe Shu
Reviewed-by: Baolin Wang
Reviewed-by: Xu Yu
Link: https://gitee.com/anolis/cloud-kernel/pulls/4044
Signed-off-by: Weilin Tong
Link: https://gitee.com/anolis/cloud-kernel/pulls/4219
Signed-off-by: Kaihao Bai
---
 include/linux/mm.h  |  1 +
 mm/memory_hotplug.c |  2 ++
 mm/page_alloc.c     | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 95ad2c61151c..a8c0ff698ddd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4016,6 +4016,7 @@ static inline int early_pfn_to_nid(unsigned long pfn)
 extern int __meminit early_pfn_to_nid(unsigned long pfn);
 #endif
 
+extern int __meminit init_min_cache_kbytes(void);
 extern void mem_init(void);
 extern void __init mmap_init(void);
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2a943ec57c85..a86ccbbcd653 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1224,6 +1224,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 
	/* reinitialise watermarks and update pcp limits */
	init_per_zone_wmark_min();
+	init_min_cache_kbytes();
 
	kswapd_run(nid);
	kcompactd_run(nid);
@@ -2060,6 +2061,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 
	/* reinitialise watermarks and update pcp limits */
	init_per_zone_wmark_min();
+	init_min_cache_kbytes();
 
	/*
	 * Check whether this operation removes the last normal memory from
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f6c30b00cd87..dca306081415 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6588,6 +6588,41 @@ static void setup_min_cache_kbytes(void)
	}
 }
 
+/*
+ * Initialise min_cache_kbytes.
+ *
+ *  0  < total memory <= 4G,   min_cache_kbytes: 150M
+ *  4G < total memory <= 8G,   min_cache_kbytes: 300M
+ *  8G < total memory <= 16G,  min_cache_kbytes: 400M
+ * 16G < total memory <= 128G, min_cache_kbytes: 500M
+ *       total memory > 128G,  min_cache_kbytes: 1024M
+ */
+int __meminit init_min_cache_kbytes(void)
+{
+	unsigned long total_ram_bytes = totalram_pages() << PAGE_SHIFT;
+
+	if (total_ram_bytes <= 4UL * SZ_1G)
+		/* limit min_cache_kbytes to 1/2 of total memory at most */
+		if (total_ram_bytes / 2 < 150 * SZ_1M)
+			sysctl_min_cache_kbytes = total_ram_bytes / 2 / SZ_1K;
+		else
+			sysctl_min_cache_kbytes = 150 * SZ_1K;
+	else if (total_ram_bytes <= 8UL * SZ_1G)
+		sysctl_min_cache_kbytes = 300 * SZ_1K;
+	else if (total_ram_bytes <= 16UL * SZ_1G)
+		sysctl_min_cache_kbytes = 400 * SZ_1K;
+	else if (total_ram_bytes <= 128UL * SZ_1G)
+		sysctl_min_cache_kbytes = 500 * SZ_1K;
+	else
+		sysctl_min_cache_kbytes = 1024 * SZ_1K;
+
+	setup_min_cache_kbytes();
+
+	return 0;
+}
+postcore_initcall(init_min_cache_kbytes);
+
 static int sysctl_min_cache_kbytes_sysctl_handler(const struct ctl_table *table, int write,
		void __user *buffer, size_t *length, loff_t *ppos)
 {
-- 
Gitee

From c90ca95b3a05678564edcf2b4d525df5a10979d1 Mon Sep 17 00:00:00 2001
From: Kaihao Bai
Date: Thu, 16 Apr 2026 11:45:44 +0800
Subject: [PATCH 6/6] anolis: mm: exclude dirty pages from min_cache_kbytes
 protection

ANBZ: #12338
ANBZ: #33265

min_cache_kbytes is intended to protect hot code segments and shared
libraries from reclaim, which are typically clean file pages. However,
the current implementation counts both clean and dirty file pages
toward the watermark, causing dirty pages to incorrectly consume the
reserved quota and leaving hot clean file pages vulnerable to
eviction.

Only account clean file pages against the min_cache_kbytes watermark,
excluding dirty pages, which are handled independently via the
writeback path.
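
As a worked example with illustrative numbers: consider a node with
150000 file pages, 120000 of them dirty, and min_cache_pages of 128000
(500M with 4K pages). Previously, 150000 > 128000 left
file_is_reserved clear, so hot clean pages could still be evicted;
with this patch only the 30000 clean pages are compared against the
watermark, and file_is_reserved is set.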

Signed-off-by: Kaihao Bai
Reviewed-by: Baolin Wang
Link: https://gitee.com/anolis/cloud-kernel/pulls/6747
---
 mm/vmscan.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2538fff369d2..0a224683cf57 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -887,6 +887,11 @@ static enum folio_references folio_check_references(struct folio *folio,
	 */
	if (referenced_ptes == -1)
		return FOLIOREF_KEEP;
+	/*
+	 * Activate file-backed executable folios if min_cache_kbytes is enabled.
+	 */
+	if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio) && sc->file_is_reserved)
+		return FOLIOREF_ACTIVATE;
 
	if (lru_gen_enabled() && !lru_gen_switching()) {
		if (!referenced_ptes)
@@ -2300,7 +2305,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
	struct lruvec *target_lruvec;
 
	if (lru_gen_enabled() && !lru_gen_switching())
-		return;
+		goto file_reserved;
 
	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
@@ -2361,6 +2366,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
	else
		sc->cache_trim_mode = 0;
 
+file_reserved:
	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
@@ -2403,8 +2409,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
		 * given watermark.
		 */
		min_cache_kbytes = READ_ONCE(sysctl_min_cache_kbytes);
-		if (min_cache_kbytes)
+		if (min_cache_kbytes && !sc->file_is_reserved) {
+			unsigned long f_dirty;
+
+			f_dirty = node_page_state(pgdat, NR_FILE_DIRTY);
+			file = (file > f_dirty) ? file - f_dirty : 0;
			sc->file_is_reserved = file <= pgdat->min_cache_pages;
+		}
	}
 }
 
@@ -2626,9 +2637,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			BUG();
		}
 
-		if (sc->file_is_reserved && file)
-			scan = 0;
-
		nr[lru] = scan;
	}
 }
@@ -6178,6 +6186,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
	if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) {
		memset(&sc->nr, 0, sizeof(sc->nr));
+		prepare_scan_control(pgdat, sc);
		lru_gen_shrink_node(pgdat, sc);
 
		if (!lru_gen_switching())
-- 
Gitee