
Linux Kernel Memory Management - File mmap Mapping and Reverse Mapping

This is the eighth article in the "Linux Kernel Memory Management" series:

Part 1 briefly surveyed the key concepts of the kernel memory management process.

Part 2 introduced the kernel's memory-related data structures.

Part 3 covered memory handling from the first line of kernel code up to the jump into C code.

Part 4 gave an overview of memory handling in the early C initialization code.

Part 5 (I) and Part 5 (II) introduced the memblock and buddy system allocators.

Part 6 explained how the memory-error detection tool KFence works.

Part 7 explained how process memory allocation with malloc() works.

mmap and munmap

mmap() maps a file (a regular file or a device file) into a process's address space, so that the application can access the file through ordinary memory reads and writes. Its counterpart, munmap(), tears the mapping down.

The sample program below comes from Wikipedia:

#include <sys/types.h>
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char str1[] = "string 1";
    const char str2[] = "string 2";
    pid_t parpid = getpid(), childpid;
    int fd = -1;
    char *anon, *zero;

    if ((fd = open("/dev/zero", O_RDWR, 0)) == -1)
        err(1, "open");

    anon = (char *)mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_ANON|MAP_SHARED, -1, 0);
    zero = (char *)mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);

    if (anon == MAP_FAILED || zero == MAP_FAILED)
        errx(1, "either mmap");

    strcpy(anon, str1);
    strcpy(zero, str1);

    printf("PID %d:\tanonymous %s, zero-backed %s\n", parpid, anon, zero);
    switch ((childpid = fork())) {
    case -1:
        err(1, "fork");
        /* NOTREACHED */
    case 0:
        childpid = getpid();
        printf("PID %d:\tanonymous %s, zero-backed %s\n", childpid, anon, zero);
        sleep(3);

        printf("PID %d:\tanonymous %s, zero-backed %s\n", childpid, anon, zero);
        munmap(anon, 4096);
        munmap(zero, 4096);
        close(fd);
        return EXIT_SUCCESS;
    }

    sleep(2);
    strcpy(anon, str2);
    strcpy(zero, str2);

    printf("PID %d:\tanonymous %s, zero-backed %s\n", parpid, anon, zero);
    munmap(anon, 4096);
    munmap(zero, 4096);
    close(fd);
    return EXIT_SUCCESS;
}

The output of the above code is:

PID 22475:      anonymous string 1, zero-backed string 1
PID 22476:      anonymous string 1, zero-backed string 1
PID 22475:      anonymous string 2, zero-backed string 2
PID 22476:      anonymous string 2, zero-backed string 2

The program creates two shared mappings, one anonymous and one backed by /dev/zero, through which the parent and child processes access the same memory.

The overall architecture of mmap and munmap is shown in the figure below; it closely parallels the malloc() mechanism covered in the previous article:

  • The application creates and destroys mappings through the libc APIs mmap() and munmap()
  • libc issues the corresponding system calls, such as SYS_mmap and SYS_munmap
  • For mmap(), the kernel creates the VMA mapping from the supplied address, mapping length, and file information
  • For munmap(), the kernel removes the VMA mapping identified by the address information

mmap and munmap
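
As a concrete illustration of the second point, here is a minimal sketch (assuming an x86-64 Linux host, where the SYS_mmap/SYS_munmap syscall numbers are exposed) that bypasses the libc wrappers and issues the system calls directly via syscall(2):

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
    /* Same six arguments as the libc wrapper; glibc's syscall()
     * converts the raw -errno return into -1 plus errno. */
    long addr = syscall(SYS_mmap, NULL, 4096UL,
                        PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == -1)
        return 1;

    printf("mapped one page at %p\n", (void *)addr);
    return syscall(SYS_munmap, addr, 4096UL) == 0 ? 0 : 1;
}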

The core function implementing mmap inside the kernel is do_mmap():

unsigned long do_mmap(struct file *file, unsigned long addr,
            unsigned long len, unsigned long prot,
            unsigned long flags, unsigned long pgoff,
            unsigned long *populate, struct list_head *uf)
{
    struct mm_struct *mm = current->mm;
    .....................
    if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
        if (!(file && path_noexec(&file->f_path)))
            prot |= PROT_EXEC;

    /* force arch specific MAP_FIXED handling in get_unmapped_area */
    if (flags & MAP_FIXED_NOREPLACE)
        flags |= MAP_FIXED;

    if (!(flags & MAP_FIXED))
        addr = round_hint_to_min(addr);

    /* Careful about overflows.. */
    len = PAGE_ALIGN(len);
    if (!len)
        return -ENOMEM;

    /* offset overflow? */
    if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
        return -EOVERFLOW;

    /* Too many mappings? */
    if (mm->map_count > sysctl_max_map_count)
        return -ENOMEM;
    .........
    if (flags & MAP_NORESERVE) {
        /* We honor MAP_NORESERVE if allowed to overcommit */
        if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
            vm_flags |= VM_NORESERVE;

        /* hugetlb applies strict overcommit unless MAP_NORESERVE */
        if (file && is_file_hugepages(file))
            vm_flags |= VM_NORESERVE;
    }

    addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
    if (!IS_ERR_VALUE(addr) &&
        ((vm_flags & VM_LOCKED) ||
         (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
        *populate = len;
    return addr;
}

This function mostly performs a series of sanity checks on its arguments, configures vm_flags based on them, and finally hands off to mmap_region() to create the mapping.
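
One of the checks above is directly user-visible: when MAP_FIXED_NOREPLACE is set, the kernel refuses to clobber an existing mapping and fails with EEXIST. A minimal sketch demonstrating this (assuming Linux 4.17 or later, where the flag was introduced):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
    /* Create a page-sized mapping somewhere. */
    void *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (a == MAP_FAILED)
        return 1;

    /* Try to map over it: plain MAP_FIXED would silently replace it,
     * but MAP_FIXED_NOREPLACE fails with EEXIST instead. */
    void *b = mmap(a, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
                   -1, 0);
    if (b == MAP_FAILED && errno == EEXIST)
        printf("kernel refused to replace the mapping at %p\n", a);
    return 0;
}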

unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
        struct list_head *uf)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma, *prev, *merge;
    int error;
    struct rb_node **rb_link, *rb_parent;
    unsigned long charged = 0;

    /* Check against address space limit. */
    if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
        unsigned long nr_pages;
        nr_pages = count_vma_pages_range(mm, addr, addr + len);

        if (!may_expand_vm(mm, vm_flags,
                    (len >> PAGE_SHIFT) - nr_pages))
            return -ENOMEM;
    }

    /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
    if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
        return -ENOMEM;
    /*
     * Private writable mapping: check memory availability
     */
    if (accountable_mapping(file, vm_flags)) {
        charged = len >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            return -ENOMEM;
        vm_flags |= VM_ACCOUNT;
    }

    vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
            NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
    if (vma)
        goto out;

    vma = vm_area_alloc(mm);
    if (!vma) {
        error = -ENOMEM;
        goto unacct_error;
    }

    vma->vm_start = addr;
    vma->vm_end = addr + len;
    vma->vm_flags = vm_flags;
    vma->vm_page_prot = vm_get_page_prot(vm_flags);
    vma->vm_pgoff = pgoff;

    if (file) {
        if (vm_flags & VM_SHARED) {
            error = mapping_map_writable(file->f_mapping);
            if (error)
                goto free_vma;
        }

        vma->vm_file = get_file(file);
        error = call_mmap(file, vma);
        if (error)
            goto unmap_and_free_vma;

        WARN_ON_ONCE(addr != vma->vm_start);

        addr = vma->vm_start;

        /* If vm_flags changed after call_mmap(), we should try merge vma again
         * as we may succeed this time.
         */
        if (unlikely(vm_flags != vma->vm_flags && prev)) {
            merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
                NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
            if (merge) {
                /* ->mmap() can change vma->vm_file and fput the original file. So
                 * fput the vma->vm_file here or we would add an extra fput for file
                 * and cause general protection fault ultimately.
                 */
                fput(vma->vm_file);
                vm_area_free(vma);
                vma = merge;
                /* Update vm_flags to pick up the change. */
                vm_flags = vma->vm_flags;
                goto unmap_writable;
            }
        }

        vm_flags = vma->vm_flags;
    } else if (vm_flags & VM_SHARED) {
        error = shmem_zero_setup(vma);
        if (error)
            goto free_vma;
    } else {
        vma_set_anonymous(vma);
    }

    /* Allow architectures to sanity-check the vm_flags */
    if (!arch_validate_flags(vma->vm_flags)) {
        error = -EINVAL;
        if (file)
            goto unmap_and_free_vma;
        else
            goto free_vma;
    }

    vma_link(mm, vma, prev, rb_link, rb_parent);
    /* Once vma denies write, undo our temporary denial count */
unmap_writable:
    if (file && vm_flags & VM_SHARED)
        mapping_unmap_writable(file->f_mapping);
    file = vma->vm_file;
out:
    perf_event_mmap(vma);

    vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
    if (vm_flags & VM_LOCKED) {
        if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                    is_vm_hugetlb_page(vma) ||
                    vma == get_gate_vma(current->mm))
            vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
        else
            mm->locked_vm += (len >> PAGE_SHIFT);
    }

    if (file)
        uprobe_mmap(vma);

    /*
     * New (or expanded) vma always get soft dirty status.
     * Otherwise user-space soft-dirty page tracker won't
     * be able to distinguish situation when vma area unmapped,
     * then new mapped in-place (which must be aimed as
     * a completely new data area).
     */
    vma->vm_flags |= VM_SOFTDIRTY;
    vma_set_page_prot(vma);

    return addr;
    ........
}

The implementation of mmap_region() is also fairly straightforward and is not dissected further here. One point worth noting: if the file argument is NULL, an anonymous mapping is created, and if the shared flag VM_SHARED is not set either, the effect is the same as allocating memory with malloc(), i.e. only a memory mapping for the corresponding virtual address range is set up.
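
That anonymous private path is, in fact, how malloc() itself obtains large blocks. A minimal sketch of using it directly (the 128 KiB threshold mentioned in the comment is glibc's default M_MMAP_THRESHOLD, an allocator-specific assumption):

#include <sys/mman.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    /* file == NULL and no MAP_SHARED: mmap_region() takes the
     * vma_set_anonymous() branch above, just as it does when glibc's
     * malloc() maps requests above its ~128 KiB threshold. */
    size_t len = 256 * 1024;
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
        return 1;

    strcpy(buf, "memory backed by an anonymous private VMA");
    puts(buf);
    return munmap(buf, len) == 0 ? 0 : 1;
}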

On the kernel side, munmap() merely removes the corresponding VMA mapping, so it is not analyzed in this article. One side effect is worth seeing in action, though, as the sketch below shows: unmapping the middle of a region splits one VMA into two.
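
A minimal sketch relying only on standard mmap()/munmap() semantics; the split shows up as two separate entries in /proc/self/maps:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    long pg = sysconf(_SC_PAGESIZE);
    char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;

    /* Punch a hole in the middle page: the kernel must split
     * the original VMA into two smaller ones. */
    if (munmap(p + pg, pg) != 0)
        return 1;

    printf("expect two entries around %p:\n", (void *)p);
    fflush(stdout);
    system("cat /proc/self/maps");
    return 0;
}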

Reverse Mapping

Reverse mapping answers the question: given a physical page, which VMAs, across all processes, map it? Why is such a lookup needed? Because the combined virtual memory of all processes is usually far larger than physical memory, the kernel keeps the system running by swapping temporarily unused physical pages out to disk and swapping them back in when needed.

To do that, the kernel must be able to determine which processes are currently using a given physical page, and that is exactly what reverse mapping provides.

A system contains a huge number of pages, so even a tiny per-page data structure for reverse mapping would add substantial memory overhead. At the same time, reverse mapping is exercised frequently, so lookups must also be as fast as possible to avoid becoming a system bottleneck.

Recall that struct page is defined with many unions to keep per-page overhead low. The members related to reverse mapping are mapping, _mapcount, and index.

struct page {
    union {
        struct {    /* Page cache and anonymous pages */
            /**
             * @lru: Pageout list, eg. active_list protected by
             * lruvec->lru_lock.  Sometimes used as a generic list
             * by the page owner.
             */
            struct list_head lru;
            /* See page-flags.h for PAGE_MAPPING_FLAGS */
            struct address_space *mapping;
            pgoff_t index;          /* Our offset within mapping. */
            /**
             * @private: Mapping-private opaque data.
             * Usually used for buffer_heads if PagePrivate.
             * Used for swp_entry_t if PageSwapCache.
             * Indicates order in the buddy system if PageBuddy.
             */
            unsigned long private;
        };
        .............
    };

    union {     /* This union is 4 bytes in size. */
        /*
         * If the page can be mapped to userspace, encodes the number
         * of times this page is referenced by a page table.
         */
        atomic_t _mapcount;

        unsigned int page_type;

        unsigned int active;        /* SLAB */
        int units;                  /* SLOB */
    };
    .............
} _struct_page_alignment;

The kernel documentation illustrates this mapping intuitively, as shown below:

Reverse mapping illustration

Put simply: starting from the physical page descriptor struct page, the mapping member leads to all VMAs covering that page, and therefore to every virtual page currently backed by that physical page.
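
One detail hiding behind that sentence: mapping may point either to a file's address_space or, for an anonymous page, to an anon_vma, and the kernel tells the two apart by tagging the pointer's low bit. A simplified sketch modeled on PageAnon() in include/linux/page-flags.h (compound-page handling and the other flag bits are omitted):

/* Simplified sketch; the real definitions live in page-flags.h. */
#define PAGE_MAPPING_ANON   0x1

static inline bool page_is_anon(struct page *page)
{
    /* An anonymous page stores an anon_vma pointer with bit 0 set;
     * a page-cache page stores an aligned address_space pointer,
     * so bit 0 is clear. */
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}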

The path from mapping to the VMAs is not as simple as the figure suggests; many situations must be handled in practice, so the kernel designed the following data structures:

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
    struct anon_vma *root;      /* Root of this anon_vma tree */
    struct rw_semaphore rwsem;  /* W: modification, R: walking the list */
    atomic_t refcount;
    unsigned degree;
    struct anon_vma *parent;    /* Parent of this anon_vma */
    struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
    struct vm_area_struct *vma;
    struct anon_vma *anon_vma;
    struct list_head same_vma;  /* locked by mmap_lock & page_table_lock */
    struct rb_node rb;          /* locked by anon_vma->rwsem */
    unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
    unsigned long cached_vma_start, cached_vma_last;
#endif
};

The comments on these definitions in fact explain why the extra structures are needed instead of having mapping point directly at a vm_area_struct: VMAs come and go as they are merged, split, and so on.

The figure below shows how the reverse-mapping fields change when a new process is forked:

  • Forking a new process copies every VM area (vm_area_struct)
  • For each VM area, the entries on its anon_vma_chain list are copied and linked to the parent's anon_vma
  • For each VM area, a new anon_vma and anon_vma_chain are created and also associated with the parent's anon_vma

Once these links are in place, all associated VM areas can be reached from the physical page descriptor struct page; a sketch of that walk follows.
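
A simplified sketch of the lookup, modeled on rmap_walk_anon() in mm/rmap.c (locking, compound pages, and error handling are omitted): mapping yields the anon_vma, and the interval tree rooted there is queried with the page's offset to visit every VMA that maps the page.

/* Simplified sketch modeled on rmap_walk_anon() in mm/rmap.c. */
static void walk_page_vmas(struct page *page)
{
    struct anon_vma *anon_vma = page_anon_vma(page); /* from page->mapping */
    pgoff_t pgoff = page_to_pgoff(page);             /* from page->index */
    struct anon_vma_chain *avc;

    /* Query the interval tree of anon_vma_chains for this offset. */
    anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
        struct vm_area_struct *vma = avc->vma;
        unsigned long address = vma_address(page, vma);

        /* 'address' is where this physical page is mapped in 'vma';
         * the real walk unmaps or updates the PTE found there. */
    }
}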

Relationships among the reverse-mapping data structures

Conclusion

This article has given an overview of how mmap and reverse mapping work.

  • mmap lets a user-space process map a range of virtual addresses, whether to share memory, allocate memory, or access a file node through ordinary memory accesses.
  • Reverse mapping lets the kernel find all virtual addresses that map a given physical page, which the system needs when swapping pages.

Memory management is a sprawling subject and the product of the kernel community's collective effort, so the author's understanding inevitably contains omissions or inaccuracies. Questions and suggestions are welcome in the comments.