Linux Kernel Memory Management - Memblock and Buddy System (2)

Picking up where we left off, this is the second part of "Memblock and Buddy System"; see the previous post for part one.

The Buddy System

Transitioning from Memblock to the Buddy System

The buddy system manages memory in units of pages. Before it can take over, in addition to building the mem_section structures, the memory that Memblock no longer needs must be released and handed over to the buddy system. This transition, sketched in Figure 2, is implemented by memblock_free_all:

void __init memblock_free_all(void)
{
    unsigned long pages;

    free_unused_memmap();
    reset_all_zones_managed_pages();
    pages = free_low_memory_core_early();
    totalram_pages_add(pages);
}
  • free_unused_memmap frees mem_map memory that is no longer used.

  • reset_all_zones_managed_pages resets managed_pages to 0 for every zone of every node (managed_pages is the number of pages managed by the buddy system).

  • free_low_memory_core_early does two things (a simplified sketch follows this list):

    • mark the pages backing reserve-type memblock regions, and memory explicitly marked as Memory None, as reserved (PG_reserved)

    • free the memory-type memblock regions into the buddy system and mark those pages as free

  • totalram_pages_add adds to _totalram_pages, which records the total number of usable pages in the system.
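
To make those two actions concrete, here is a heavily simplified sketch of free_low_memory_core_early, loosely following the v5.x sources. Helper names and signatures vary across kernel versions, so treat it as an outline rather than verbatim kernel code:

/*
 * Simplified sketch: hand everything memblock no longer needs
 * over to the buddy allocator.
 */
static unsigned long __init free_low_memory_core_early(void)
{
    unsigned long count = 0;
    phys_addr_t start, end;
    u64 i;

    /* Action 1: pages backing reserved regions get PG_reserved set. */
    for_each_reserved_mem_range(i, &start, &end)
        reserve_bootmem_region(start, end);

    /* Action 2: every free memory-type range is released into the
     * buddy free lists, counting the pages along the way. */
    for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
                            &start, &end, NULL)
        count += __free_memory_core(start, end);

    return count;
}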

Management Scheme

The buddy system's management scheme can be illustrated with this figure from Understanding the Linux® Virtual Memory Manager:

Figure: the buddy system's per-order free lists

Each memory zone has an array of free lists; the element at index order holds blocks of $2^{order}$ contiguous pages. Allocation and freeing revolve around this table; the sketch below illustrates the order arithmetic.
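
A self-contained userspace sketch (not kernel code; order_for is a hypothetical helper): a request for n pages is rounded up to the next power of two, and when that order's list is empty a larger block is split one level at a time, each split returning the unused buddy half to the next lower order's list:

#include <stdio.h>

/* Smallest order whose block of 2^order pages covers n pages. */
static unsigned int order_for(unsigned long n)
{
    unsigned int order = 0;

    while ((1UL << order) < n)
        order++;
    return order;
}

int main(void)
{
    unsigned long n = 5;               /* we want 5 pages */
    unsigned int order = order_for(n); /* -> order 3, an 8-page block */

    printf("request %lu pages -> order %u (%lu pages)\n",
           n, order, 1UL << order);

    /* If only an order-5 (32-page) block is free, it is split twice:
     * 32 -> 16 + 16, then 16 -> 8 + 8. One 8-page block satisfies the
     * request; the leftover buddies go onto the order-4 and order-3
     * free lists. */
    for (unsigned int o = 5; o > order; o--)
        printf("split order %u -> two order-%u buddies\n", o, o - 1);
    return 0;
}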

Data Structures

The data structures article already introduced struct page, struct zone, struct pglist_data, and related structures. Let's revisit some of their fields:

//include/linux/mmzone.h
struct zone {
    ...
    struct pglist_data *zone_pgdat;
    ...
    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    unsigned long zone_start_pfn;
    ...
    atomic_long_t managed_pages;  // managed_pages = present_pages - reserved_pages;
    unsigned long spanned_pages;  // spanned_pages = zone_end_pfn - zone_start_pfn;
    unsigned long present_pages;  // present_pages = spanned_pages - absent_pages(pages in holes);
#ifdef CONFIG_CMA
    unsigned long cma_pages;
#endif
    ...
    /* free areas of different sizes */
    struct free_area free_area[MAX_ORDER];
    ...
} ____cacheline_internodealigned_in_smp;

zone

  • zone_pgdat: the memory node this zone belongs to
  • zone_start_pfn: the starting page frame number of the zone
  • managed_pages: the number of pages in this zone managed by the buddy system
  • spanned_pages: the total number of pages the zone spans, holes included
  • present_pages: the number of pages physically present in the zone, i.e. spanned minus holes (system-reserved pages included)
  • free_area: as above, the per-order free lists the buddy system works with.
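
All three counters are exported per zone through /proc/zoneinfo, so the invariant managed <= present <= spanned is easy to check on a live system. A minimal userspace sketch that prints them:

#include <stdio.h>
#include <string.h>

/* Print each zone header and its spanned/present/managed counters. */
int main(void)
{
    char line[256];
    FILE *f = fopen("/proc/zoneinfo", "r");

    if (!f) {
        perror("/proc/zoneinfo");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        if (strstr(line, "Node") == line ||  /* e.g. "Node 0, zone DMA" */
            strstr(line, "spanned") ||
            strstr(line, "present") ||
            strstr(line, "managed"))
            fputs(line, stdout);
    }
    fclose(f);
    return 0;
}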
typedef struct pglist_data {
    /*
     * node_zones contains just the zones for THIS node. Not all of the
     * zones may be populated, but it is the full list. It is referenced by
     * this node's node_zonelists as well as other node's node_zonelists.
     */
    struct zone node_zones[MAX_NR_ZONES];

    /*
     * node_zonelists contains references to all zones in all nodes.
     * Generally the first zones will be references to this node's
     * node_zones.
     */
    struct zonelist node_zonelists[MAX_ZONELISTS];
    ...
    wait_queue_head_t kswapd_wait;
    ...
    struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */
    int kswapd_order;
    enum zone_type kswapd_highest_zoneidx;
    int kswapd_failures;        /* Number of 'reclaimed == 0' runs */
    unsigned long totalreserve_pages;
    ...
} pg_data_t;

pglist_data

  • node_zones: the array holding all memory zones of this node
  • node_zonelists: references to all zones in all nodes, in the order allocations should fall back to them (this node's own zones generally come first)
  • kswapd_wait, kswapd, kswapd_order, etc.: the fields needed to run the node's kswapd thread.

Memory Allocation

Memory is allocated with the alloc_pages* family of functions.
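
Before looking at the internals, a minimal usage sketch. This assumes a kernel-module context (demo_init/demo_exit are hypothetical names) and omits everything but the allocation itself:

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *pg;

static int __init demo_init(void)
{
    /* Ask the buddy allocator for an order-2 block: 2^2 = 4 contiguous pages. */
    pg = alloc_pages(GFP_KERNEL, 2);
    if (!pg)
        return -ENOMEM;

    pr_info("4 pages at kernel address %p\n", page_address(pg));
    return 0;
}

static void __exit demo_exit(void)
{
    /* Free with the same order that was used to allocate. */
    __free_pages(pg, 2);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The common core of the family is __alloc_pages: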

struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
                           nodemask_t *nodemask)
{
    ....
    if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
                             &alloc_gfp, &alloc_flags))
        return NULL;
    ....
    /* First allocation attempt */
    page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
    if (likely(page))
        goto out;
    ...
    page = __alloc_pages_slowpath(alloc_gfp, order, &ac);

out:
    if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page &&
        unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) {
        __free_pages(page, order);
        page = NULL;
    }

    trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);

    return page;
}

There is a lot of code here, but the heart of it is the following three functions:

  • prepare_alloc_pages: prepares the allocation context before the actual allocation, in particular selecting the candidate memory zones from a suitable memory node:
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
                                       int preferred_nid, nodemask_t *nodemask,
                                       struct alloc_context *ac, gfp_t *alloc_gfp,
                                       unsigned int *alloc_flags)
{
    ac->highest_zoneidx = gfp_zone(gfp_mask);
    ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
    ac->nodemask = nodemask;
    ac->migratetype = gfp_migratetype(gfp_mask);

    ........
    /*
     * The preferred zone is used for statistics but crucially it is
     * also used as the starting point for the zonelist iterator. It
     * may get reset for allocations that ignore memory policies.
     */
    ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                                                 ac->highest_zoneidx, ac->nodemask);

    return true;
}
  • get_page_from_freelist: walks the candidate zones and repeatedly tries to satisfy the allocation with the given parameters. Borrowing another figure from Understanding the Linux® Virtual Memory Manager: when the free list of the requested order is empty, a block from a higher-order free_area is split to satisfy the request, and the corresponding page pointer is returned. If allocation fails everywhere, it returns NULL. (A simplified model of the watermark check it performs appears after the code.)

Figure: buddy system allocation (splitting a higher-order block)

static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                       const struct alloc_context *ac)
{
    struct zoneref *z;
    struct zone *zone;
    struct pglist_data *last_pgdat_dirty_limit = NULL;
    bool no_fallback;

retry:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
     */
    no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
    z = ac->preferred_zoneref;
    for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
                                    ac->nodemask) {
        struct page *page;
        unsigned long mark;

        if (cpusets_enabled() &&
            (alloc_flags & ALLOC_CPUSET) &&
            !__cpuset_zone_allowed(zone, gfp_mask))
            continue;
        if (ac->spread_dirty_pages) {
            if (last_pgdat_dirty_limit == zone->zone_pgdat)
                continue;

            if (!node_dirty_ok(zone->zone_pgdat)) {
                last_pgdat_dirty_limit = zone->zone_pgdat;
                continue;
            }
        }

        if (no_fallback && nr_online_nodes > 1 &&
            zone != ac->preferred_zoneref->zone) {
            int local_nid;

            /*
             * If moving to a remote node, retry but allow
             * fragmenting fallbacks. Locality is more important
             * than fragmentation avoidance.
             */
            local_nid = zone_to_nid(ac->preferred_zoneref->zone);
            if (zone_to_nid(zone) != local_nid) {
                alloc_flags &= ~ALLOC_NOFRAGMENT;
                goto retry;
            }
        }

        mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
        if (!zone_watermark_fast(zone, order, mark,
                                 ac->highest_zoneidx, alloc_flags,
                                 gfp_mask)) {
            int ret;

            ...
            /* Checked here to keep the fast path fast */
            BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                goto try_this_zone;

            if (!node_reclaim_enabled() ||
                !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
                continue;

            ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
            switch (ret) {
            case NODE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case NODE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough */
                if (zone_watermark_ok(zone, order, mark,
                                      ac->highest_zoneidx, alloc_flags))
                    goto try_this_zone;

                continue;
            }
        }

try_this_zone:
        page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                       gfp_mask, alloc_flags, ac->migratetype);
        if (page) {
            prep_new_page(page, order, gfp_mask, alloc_flags);

            if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
                reserve_highatomic_pageblock(page, zone, order);
            return page;
        } else {
            ...
        }
    }

    if (no_fallback) {
        alloc_flags &= ~ALLOC_NOFRAGMENT;
        goto retry;
    }

    return NULL;
}
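
The watermark check above is the gatekeeper of the fast path: it asks whether, after handing out 2^order pages, the zone would still sit above the chosen watermark once pages reserved for lower zones (lowmem_reserve) are excluded. A simplified userspace model of the idea (watermark_ok is a hypothetical stand-in; the real __zone_watermark_ok additionally accounts for highatomic reserves, CMA pages, and whether a suitable per-order free list is actually populated):

#include <stdbool.h>
#include <stdio.h>

/* Does the zone stay above `mark` after an order-`order` allocation? */
static bool watermark_ok(long free_pages, unsigned int order,
                         long mark, long lowmem_reserve)
{
    free_pages -= (1L << order) - 1;
    return free_pages > mark + lowmem_reserve;
}

int main(void)
{
    /* 1000 free pages, an order-3 (8-page) request, low watermark 900. */
    printf("%s\n", watermark_ok(1000, 3, 900, 64) ? "ok" : "below watermark");
    return 0;
}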
  • __alloc_pages_slowpath: if get_page_from_freelist fails, allocation drops into the slow path, which tries increasingly aggressive ways of making memory available, roughly in this order (a condensed outline follows this list):
    • wake kswapd to reclaim memory in the background and retry the free lists;
    • fall back to direct reclaim in the allocating task's own context;
    • if reclaim still fails, kill a process via the OOM killer to get its memory back.
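
A heavily condensed outline of that order, using function names from mm/page_alloc.c of the v5.x era (retry loops, compaction, and many corner cases omitted; treat this as a reading aid, not verbatim kernel code):

/* Condensed sketch of __alloc_pages_slowpath, not verbatim kernel code. */
static struct page *__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                           struct alloc_context *ac)
{
    unsigned int alloc_flags = ALLOC_WMARK_MIN;
    unsigned long did_some_progress = 0;
    struct page *page;

    /* 1. Kick background reclaim, then retry the free lists. */
    wake_all_kswapds(order, gfp_mask, ac);
    page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
    if (page)
        return page;

    /* 2. Reclaim directly in the allocating task's context. */
    page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
                                        &did_some_progress);
    if (page)
        return page;

    /* 3. Last resort: let the OOM killer pick a victim to free memory. */
    page = __alloc_pages_may_oom(gfp_mask, order, alloc_flags, ac,
                                 &did_some_progress);
    return page;
}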

Memory Freeing

Freeing ultimately reaches __free_one_page. The process is easy to state: find the buddy of the block being freed, merge the pair into a block of the next higher order, repeat until no further merge is possible, then put the merged block on the free_area list of the resulting order (a worked example of the buddy arithmetic follows the code).

static inline void __free_one_page(struct page *page,
                                   unsigned long pfn,
                                   struct zone *zone, unsigned int order,
                                   int migratetype, fpi_t fpi_flags)
{
    .....
    max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
    .....
continue_merging:
    while (order < max_order) {
        if (compaction_capture(capc, page, order, migratetype)) {
            __mod_zone_freepage_state(zone, -(1 << order),
                                      migratetype);
            return;
        }
        buddy_pfn = __find_buddy_pfn(pfn, order);
        buddy = page + (buddy_pfn - pfn);

        if (!pfn_valid_within(buddy_pfn))
            goto done_merging;
        if (!page_is_buddy(page, buddy, order))
            goto done_merging;
        /*
         * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
         * merge with it and move up one order.
         */
        if (page_is_guard(buddy))
            clear_page_guard(zone, buddy, order, migratetype);
        else
            del_page_from_free_list(buddy, zone, order);
        combined_pfn = buddy_pfn & pfn;
        page = page + (combined_pfn - pfn);
        pfn = combined_pfn;
        order++;
    }
    if (order < MAX_ORDER - 1) {
        /* If we are here, it means order is >= pageblock_order.
         * We want to prevent merge between freepages on isolate
         * pageblock and normal pageblock. Without this, pageblock
         * isolation could cause incorrect freepage or CMA accounting.
         *
         * We don't want to hit this code for the more frequent
         * low-order merging.
         */
        if (unlikely(has_isolate_pageblock(zone))) {
            int buddy_mt;

            buddy_pfn = __find_buddy_pfn(pfn, order);
            buddy = page + (buddy_pfn - pfn);
            buddy_mt = get_pageblock_migratetype(buddy);

            if (migratetype != buddy_mt
                && (is_migrate_isolate(migratetype) ||
                    is_migrate_isolate(buddy_mt)))
                goto done_merging;
        }
        max_order = order + 1;
        goto continue_merging;
    }

done_merging:
    set_buddy_order(page, order);

    if (fpi_flags & FPI_TO_TAIL)
        to_tail = true;
    else if (is_shuffle_order(order))
        to_tail = shuffle_pick_tail();
    else
        to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

    if (to_tail)
        add_to_free_list_tail(page, zone, order, migratetype);
    else
        add_to_free_list(page, zone, order, migratetype);

    /* Notify page reporting subsystem of freed page */
    if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
        page_reporting_notify_free(order);
}
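
The buddy arithmetic deserves a worked example: __find_buddy_pfn(pfn, order) boils down to pfn ^ (1 << order), and as the code above shows, the merged block starts at combined_pfn = buddy_pfn & pfn. A self-contained userspace illustration:

#include <stdio.h>

int main(void)
{
    /* Free page frame 0x8 and merge upward, as __free_one_page would. */
    unsigned long pfn = 0x8;

    for (unsigned int order = 0; order < 3; order++) {
        unsigned long buddy_pfn = pfn ^ (1UL << order); /* __find_buddy_pfn() */
        unsigned long combined_pfn = buddy_pfn & pfn;   /* start of merged block */

        printf("order %u: block %#lx + buddy %#lx -> order-%u block %#lx\n",
               order, pfn, buddy_pfn, order + 1, combined_pfn);
        pfn = combined_pfn;
    }
    return 0;
}

Flipping bit order of the pfn yields the buddy; clearing it yields the start of the doubled block, which is why a block and its buddy are always naturally aligned to $2^{order+1}$ pages.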

kswapd

As mentioned in the Memory Allocation section, __alloc_pages_slowpath wakes kswapd to reclaim memory. Each memory node runs its own kswapd; its setup and main loop are shown below. At its core, the loop simply calls balance_pgdat to do the reclaim, then goes back to sleep until the next wakeup.

int kswapd_run(int nid)
{
    pg_data_t *pgdat = NODE_DATA(nid);
    ...
    pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
    ...
}

static int kswapd(void *p)
{
    ...
    for ( ; ; ) {
        bool ret;

        alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
        highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                                                 highest_zoneidx);

kswapd_try_sleep:
        kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                            highest_zoneidx);
        ......
        ret = try_to_freeze();
        if (kthread_should_stop())
            break;
        ......
        trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
                                    alloc_order);
        reclaim_order = balance_pgdat(pgdat, alloc_order,
                                      highest_zoneidx);
        if (reclaim_order < alloc_order)
            goto kswapd_try_sleep;
    }

    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);

    return 0;
}

Summary

This article is a simple write-up of my own study of Linux memory management, covering:

  • Memblock, the memory manager of early Linux initialization
  • the Linux physical memory model
  • the Linux buddy system (buddy allocator)

I hope it also helps you understand memory management in Linux.

References

  1. Linux kernel source. https://elixir.bootlin.com/linux/latest/source/
  2. Linux kernel documentation. https://www.kernel.org/doc/html/latest/vm/index.html
  3. Mel Gorman. Understanding the Linux® Virtual Memory Manager. http://ptgmedia.pearsoncmg.com/images/0131453483/downloads/gorman_book.pdf