diff --git a/include/linux/mm.h b/include/linux/mm.h index 95ad2c61151ce723b774561435d0155830235059..a8c0ff698dddc0c0fb6940965e78fbdbdab032b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4016,6 +4016,7 @@ static inline int early_pfn_to_nid(unsigned long pfn) extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif +extern int __meminit init_min_cache_kbytes(void); extern void mem_init(void); extern void __init mmap_init(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 03c5cd62dcb82a96da89bdf8c7486b1ef0f014b2..504c202cc4551b7dba7adcfcc677d25eefbe7069 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1552,6 +1552,7 @@ typedef struct pglist_data { */ unsigned long totalreserve_pages; + unsigned long min_cache_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. diff --git a/include/linux/swap.h b/include/linux/swap.h index 50663eec7b181164eb8a04917dc3382e2cc90057..59f37e86872da0b40d7d9f6ac9f7fbe486816208 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -311,6 +311,7 @@ void workingset_activation(struct folio *folio); /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; +extern unsigned long sysctl_min_cache_kbytes; /* Definition of global_zone_page_state not available yet */ #define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2a943ec57c85b7f423bd1a6cea9cf6ea84699a05..a86ccbbcd653460458e74a45a4395c50f1dbc82e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1224,6 +1224,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); + init_min_cache_kbytes(); kswapd_run(nid); kcompactd_run(nid); @@ -2060,6 +2061,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); + 
init_min_cache_kbytes(); /* * Check whether this operation removes the last normal memory from diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ae5da1fafb6f3d2c1e2de0cdf8de98e417074ac8..dca306081415674aaead6946f705800ee2f924ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6557,6 +6557,96 @@ static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, return 0; } +static void setup_min_cache_kbytes(void) +{ + pg_data_t *pgdat; + struct zone *zone; + unsigned long lowmem_pages = 0; + unsigned long min_cache_pages = sysctl_min_cache_kbytes >> (PAGE_SHIFT - 10); + + for_each_online_pgdat(pgdat) + pgdat->min_cache_pages = 0; + + for_each_zone(zone) { + if (!is_highmem(zone)) + lowmem_pages += zone_managed_pages(zone); + } + + for_each_zone(zone) { + u64 tmp; + + /* + * Make sure that lowmem zone reserve a mount of file pages + * to avoid thrashing. highmem zone is allowed to eat up + * memory as soon as possible. + */ + if (!is_highmem(zone)) { + tmp = zone_managed_pages(zone) * min_cache_pages; + do_div(tmp, lowmem_pages); + zone->zone_pgdat->min_cache_pages += tmp; + } + } +} + +/* + * Initialise min_cache_kbytes. 
+ * + * 0 < total memory <= 4G, min_cache_kbytes: 150M + * 4G < total memory <= 8G, min_cache_kbytes: 300M + * 8G < total memory <= 16G, min_cache_kbytes: 400M + * 16G < total memory <= 128G, min_cache_kbytes: 500M + * total memory > 128G, min_cache_kbytes: 1024M + */ + +int __meminit init_min_cache_kbytes(void) +{ + unsigned long total_ram_bytes = totalram_pages() << PAGE_SHIFT; + + if (total_ram_bytes <= 4UL * SZ_1G) + /* limit min_cache_kbytes to 1/2 of total memory at most */ + if (total_ram_bytes / 2 < 150 * SZ_1M) + sysctl_min_cache_kbytes = total_ram_bytes / 2 / SZ_1K; + else + sysctl_min_cache_kbytes = 150 * SZ_1K; + else if (total_ram_bytes <= 8UL * SZ_1G) + sysctl_min_cache_kbytes = 300 * SZ_1K; + else if (total_ram_bytes <= 16UL * SZ_1G) + sysctl_min_cache_kbytes = 400 * SZ_1K; + else if (total_ram_bytes <= 128UL * SZ_1G) + sysctl_min_cache_kbytes = 500 * SZ_1K; + else + sysctl_min_cache_kbytes = 1024 * SZ_1K; + + setup_min_cache_kbytes(); + + return 0; +} +postcore_initcall(init_min_cache_kbytes); + +static int sysctl_min_cache_kbytes_sysctl_handler(const struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc; + unsigned long min_cache_pages; + unsigned long old_min_cache_kbytes = sysctl_min_cache_kbytes; + + rc = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) { + min_cache_pages = sysctl_min_cache_kbytes >> (PAGE_SHIFT - 10); + if (min_cache_pages > totalram_pages() / 2) { + sysctl_min_cache_kbytes = old_min_cache_kbytes; + return -EINVAL; + } + + setup_min_cache_kbytes(); + } + + return 0; +} + #ifdef CONFIG_NUMA static void setup_min_unmapped_ratio(void) { @@ -6735,6 +6825,14 @@ static const struct ctl_table page_alloc_sysctl_table[] = { .mode = 0644, .proc_handler = lowmem_reserve_ratio_sysctl_handler, }, + { + .procname = "min_cache_kbytes", + .data = &sysctl_min_cache_kbytes, + .maxlen = sizeof(sysctl_min_cache_kbytes), + .mode = 0644, + .proc_handler = 
sysctl_min_cache_kbytes_sysctl_handler, + .extra1 = SYSCTL_LONG_ZERO, + }, #ifdef CONFIG_NUMA { .procname = "numa_zonelist_order", diff --git a/mm/vmscan.c b/mm/vmscan.c index 2f7330b431394ecd47c5718b8d8e22ede261bba6..0a224683cf57af206319f258a6b4798a3035ce27 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -149,6 +149,9 @@ struct scan_control { /* Always discard instead of demoting to lower tier memory */ unsigned int no_demotion:1; + /* The file pages on the current node are not allowed to reclaim */ + unsigned int file_is_reserved:1; + /* Allocation order */ s8 order; @@ -197,6 +200,8 @@ struct scan_control { * From 0 .. MAX_SWAPPINESS. Higher means more swappy. */ int vm_swappiness = 60; +/* The min page cache should be reserved in the system */ +unsigned long sysctl_min_cache_kbytes; #ifdef CONFIG_MEMCG @@ -882,6 +887,11 @@ static enum folio_references folio_check_references(struct folio *folio, */ if (referenced_ptes == -1) return FOLIOREF_KEEP; + /* + * Activate file-backed executable folios if min_cache_kbytes is enabled. 
+ */ + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio) && sc->file_is_reserved) + return FOLIOREF_ACTIVATE; if (lru_gen_enabled() && !lru_gen_switching()) { if (!referenced_ptes) @@ -2295,7 +2305,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; if (lru_gen_enabled() && !lru_gen_switching()) - return; + goto file_reserved; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -2356,6 +2366,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) else sc->cache_trim_mode = 0; +file_reserved: /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip @@ -2368,6 +2379,7 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) if (!cgroup_reclaim(sc)) { unsigned long total_high_wmark = 0; unsigned long free, anon; + unsigned long min_cache_kbytes; int z; struct zone *zone; @@ -2390,6 +2402,20 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) file + free <= total_high_wmark && !(sc->may_deactivate & DEACTIVATE_ANON) && anon >> sc->priority; + + /* + * Reserve a specified amount of page caches in case of thrashing. + * OOM killer is preferred when the system page cache is below the + * given watermark. + */ + min_cache_kbytes = READ_ONCE(sysctl_min_cache_kbytes); + if (min_cache_kbytes && !sc->file_is_reserved) { + unsigned long f_dirty; + + f_dirty = node_page_state(pgdat, NR_FILE_DIRTY); + file = (file > f_dirty) ? file - f_dirty : 0; + sc->file_is_reserved = file <= pgdat->min_cache_pages; + } } } @@ -6160,6 +6186,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if ((lru_gen_enabled() || lru_gen_switching()) && root_reclaim(sc)) { memset(&sc->nr, 0, sizeof(sc->nr)); + prepare_scan_control(pgdat, sc); lru_gen_shrink_node(pgdat, sc); if (!lru_gen_switching())