mmap内核源码分析,基于内核版本3.10(二)

本文是mmap内核源码分析的第二部分,主要聚焦于get_unmapped_area函数。从get_unmapped_area的调用开始,通过搜索内核源码找到相关实现,并详细解释了结构体mm_struct、vm_area_struct和vm_unmapped_area_info。文章介绍了在addr非空和空时的不同处理,涉及find_vma函数和vm_unmapped_area函数,探讨了unmapped_area函数的工作原理,包括其遍历vma红黑树的过程。

在之前写了一篇mmap内核源码分析,基于内核版本3.10,经过几天再来回顾发现还是没能学习透彻,所以再写一篇(二)

mmap内核源码分析,基于内核版本3.10(一)博客地址:

https://blog.csdn.net/SweeNeil/article/details/83685812

我们接着从(一)中的get_unmapped_area函数讲起,get_unmapped_area函数定义如下所示:

/*
 * get_unmapped_area - choose a linear address range for a new mapping.
 *
 * @file:  file being mapped, may be NULL
 * @addr:  address hint passed through to the chosen allocator
 * @len:   length of the requested range
 * @pgoff: page offset, passed through to the chosen allocator
 * @flags: mapping flags, passed through to the chosen allocator
 *
 * The allocator used is current->mm->get_unmapped_area unless the file
 * supplies its own ->get_unmapped_area in its f_op table, in which case
 * the file's hook is used instead.
 *
 * Returns the selected address on success.  On failure the value returned
 * is an error code: whatever arch_mmap_check(), the allocator or
 * security_mmap_addr() reported, -ENOMEM if the range does not fit below
 * TASK_SIZE, or -EINVAL if the allocator produced an address with bits set
 * outside PAGE_MASK.
 */
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	unsigned long (*finder)(struct file *, unsigned long,
				unsigned long, unsigned long, unsigned long);
	unsigned long ret;

	/* The architecture gets the first chance to reject the request. */
	ret = arch_mmap_check(addr, len, flags);
	if (ret)
		return ret;

	/* Reject oversized requests up front so "TASK_SIZE - len" below cannot wrap. */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* Start from the mm-wide allocator, then let a file hook override it. */
	finder = current->mm->get_unmapped_area;
	if (file && file->f_op) {
		if (file->f_op->get_unmapped_area)
			finder = file->f_op->get_unmapped_area;
	}

	addr = finder(file, addr, len, pgoff, flags);
	if (IS_ERR_VALUE(addr))
		return addr;

	/* The whole range [addr, addr + len) must lie below TASK_SIZE. */
	if (addr > TASK_SIZE - len)
		return -ENOMEM;

	/* Any bit outside PAGE_MASK means the address is not page aligned. */
	if (addr & ~PAGE_MASK)
		return -EINVAL;

	addr = arch_rebalance_pgtables(addr, len);

	ret = security_mmap_addr(addr);
	if (ret)
		return ret;
	return addr;
}

在(一)中我们知道了:如果文件对象定义了get_unmapped_area方法,函数get_unmapped_area()就调用它,为文件的内存映射分配一个合适的线性地址区间;磁盘文件系统一般不会定义这个方法,此时get_unmapped_area()就会调用内存描述符的get_unmapped_area方法。

我们在(一)中讲解了如何找到get_unmapped_area,在(二)中接着这一步进行详细讲解:使用Search Project找到内核源码中所有的get_unmapped_area,然后对找到的内容逐一进行讲解,并对后续调用的代码进行更加深入的学习。

最后我们找到了:---- get_unmapped_area Matches (78 in 55 files) ----

这里按文件进行分析:

1 、fs/bad_inode.c

.get_unmapped_area = bad_file_get_unmapped_area

bad_inode.c (fs) line 167 : 
	.sendpage	= bad_file_sendpage,
	.get_unmapped_area = bad_file_get_unmapped_area,
	.check_flags	= bad_file_check_flags,
	.flock		= bad_file_flock,
	.splice_write	= bad_file_splice_write,
bfin_capture.c (drivers\media\platform\blackfin) line 946 : 
#ifndef CONFIG_MMU
	.get_unmapped_area = bcap_get_unmapped_area,
#endif
	.poll = bcap_poll
};

2、drivers/video/fbmem.c

.get_unmapped_area    = get_fb_unmapped_area

fbmem.c (drivers\video) line 1486 : 
#ifdef HAVE_ARCH_FB_UNMAPPED_AREA
	.get_unmapped_area = get_fb_unmapped_area,
#endif
#ifdef CONFIG_FB_DEFERRED_IO
	.fsync =	fb_deferred_io_fsync,
file-nommu.c (fs\ramfs) line 39 : 
	.mmap			= ramfs_nommu_mmap,
	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
	.read			= do_sync_read,
	.aio_read		= generic_file_aio_read,
	.write			= do_sync_write,
spufs_get_unmapped_area in file.c (arch\powerpc\platforms\cell\spufs) : 
	if (!csa->use_big_pages)
		return current->mm->get_unmapped_area(file, addr, len,
						      pgoff, flags);

3、arch/powerpc/platforms/cell/spufs

.get_unmapped_area    = spufs_get_unmapped_area

file.c (arch\powerpc\platforms\cell\spufs) line 365 : 
#ifdef CONFIG_SPU_FS_64K_LS
	.get_unmapped_area	= spufs_get_unmapped_area,
#endif
};

4、include/linux/fs.h

hugetlb_get_unmapped_area

file_operations in fs.h (include\linux) : 
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
hugetlb_get_unmapped_area in hugetlbpage.c (arch\sparc\mm) : 
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
hugetlb_get_unmapped_area in hugetlbpage.c (arch\tile\mm) : 
	}
	if (current->mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
hugetlb_get_unmapped_area in hugetlbpage.c (arch\x86\mm) : 
	}
	if (mm->get_unmapped_area == arch_get_unmapped_area)
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
inode.c (fs\hugetlbfs) line 708 : 
	.fsync			= noop_fsync,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
	.llseek		= default_llseek,
};

5、fs/hugetlbfs

.get_unmapped_area    = hugetlb_get_unmapped_area

inode.c (fs\hugetlbfs) line 708 : 
	.fsync			= noop_fsync,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
	.llseek		= default_llseek,
};

6、drivers/char/mem.c

.get_unmapped_area = get_unmapped_area_mem

mem.c (drivers\char) line 782 : 
	.open		= open_mem,
	.get_unmapped_area = get_unmapped_area_mem,
};

7、include/linux/mm.h

mm.h (include\linux) line 1511 : 

extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

extern unsigned long mmap_region(struct file *file, unsigned long addr,
	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);

8、fs/romfs/mmap-nommu.c

.get_unmapped_area    = romfs_get_unmapped_area

mmap-nommu.c (fs\romfs) line 78 : 
	.mmap			= romfs_mmap,
	.get_unmapped_area	= romfs_get_unmapped_area,
};

9、mm/util.c

mm->get_unmapped_area = arch_get_unmapped_area

arch_pick_mmap_layout in util.c (mm) : 
	mm->mmap_base = TASK_UNMAPPED_BASE;
	mm->get_unmapped_area = arch_get_unmapped_area;
	mm->unmap_area = arch_unmap_area;
}

……

等等,还有一系列对.get_unmapped_area钩子的赋值。如果文件操作表中定义了对应的get_unmapped_area,就调用对应的函数;如果没有,就需要调用内存描述符的get_unmapped_area函数。我们可以看看内存描述符的结构:

/*
 * Memory descriptor of a process (excerpt; the article elides the
 * remaining members with the "……" lines below).
 */
struct mm_struct {
	struct vm_area_struct * mmap;		/* list of VMAs */
	struct rb_root mm_rb;			/* root of the VMA rbtree walked by find_vma() and unmapped_area() */
	struct vm_area_struct * mmap_cache;	/* last find_vma result */
#ifdef CONFIG_MMU
	/*
	 * Address-range allocator used by get_unmapped_area() when the mapped
	 * file does not provide its own hook.
	 */
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
	/* NOTE(review): presumably the unmap-side counterpart of the hook above — confirm */
	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
#endif
	unsigned long mmap_base;		/* base of mmap area */
	unsigned long task_size;		/* size of task vm space */
	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
	unsigned long highest_vm_end;		/* highest vma end address */
	pgd_t * pgd;				/* NOTE(review): presumably the page global directory — confirm */
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	int map_count;				/* number of VMAs */
        ……
        ……
        ……
        ……
        ……
}

同样,内存描述符的get_unmapped_area成员也需要被赋值。以上面列出的第9项为例:在mm/util.c的arch_pick_mmap_layout函数中,mm->get_unmapped_area被赋值为arch_get_unmapped_area函数。我们进入到这个函数中查看相关内容:

#ifndef HAVE_ARCH_UNMAPPED_AREA
/*
 * arch_get_unmapped_area - generic address chooser, compiled only when the
 * architecture does not define HAVE_ARCH_UNMAPPED_AREA.
 *
 * @filp:  file being mapped (not used here)
 * @addr:  preferred address, 0 for "no preference"
 * @len:   length of the requested range
 * @pgoff: page offset (not used here)
 * @flags: mapping flags; only MAP_FIXED is examined
 *
 * Returns @addr unchanged for MAP_FIXED, the page-aligned hint when the
 * range starting there is free, otherwise whatever vm_unmapped_area()
 * finds between TASK_UNMAPPED_BASE and TASK_SIZE.  Returns -ENOMEM when
 * @len exceeds TASK_SIZE.
 */
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct vm_unmapped_area_info info;

	/* A request larger than the whole task address space cannot fit. */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* MAP_FIXED: hand the caller's address back without any search. */
	if (flags & MAP_FIXED)
		return addr;

	if (addr) {
		/* Try to honour the hint: align it, then look at what follows it. */
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (addr <= TASK_SIZE - len) {
			/*
			 * find_vma() gives the first VMA ending above addr; the hint
			 * is usable if there is none or it starts at/after our end.
			 */
			if (!vma || vma->vm_start >= addr + len)
				return addr;
		}
	}

	/* No usable hint: describe a bottom-up, unaligned search and run it. */
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = TASK_SIZE;
	info.length = len;
	info.align_mask = 0;
	info.flags = 0;
	return vm_unmapped_area(&info);
}
#endif

在arch_get_unmapped_area函数中声明了三个局部变量(两个结构体指针和一个结构体):

	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct vm_unmapped_area_info info;

其中struct mm_struct表示内存描述符,这里的mm指向当前进程的内存描述符(current->mm);

struct vm_area_struct 结构体用于实现线性区,Linux内核中,关于虚存管理的最基本单元应该是struct vm_area_struct了,它描述的是一段连续的、具有相同访问属性的虚存空间,该虚存空间的大小为物理内存页面的整数倍。

struct vm_unmapped_area_info 结构体,描述这样一个unmapped_area的结构体,在arch_get_unmapped_area中主要是对这个vm_unmapped_area_info结构体进行赋值,然后将这个结构体作为参数调用了vm_unmapped_area函数。

在arch_get_unmapped_area函数中当addr非空,表示指定了一个特定的优先选用地址,内核会检查该区域是否与现存区域重叠,由find_vma函数完成查找功能。

当addr为空或是指定的优先地址不满足分配条件时,内核必须遍历进程中可用的区域,设法找到一个大小适当的空闲区域,由vm_unmapped_area函数做实际的工作。

分析到这里,其实并不是arch_get_unmapped_area一个函数是这样,其他的函数同样是进行了相关操作。如果设置了addr则使用find_vma函数查看该区域是否可用。进入到find_vma查看函数内容:

/*
 * find_vma - return the first VMA whose vm_end lies above @addr.
 *
 * The result either contains @addr or is the nearest VMA after it.
 * Returns NULL when no VMA ends above @addr.  Any non-NULL result obtained
 * from the tree walk is stored in mm->mmap_cache.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	struct vm_area_struct *vma_tmp;
	struct rb_node *rb_node;

	/* Fast path: the cached VMA is only good if it actually contains addr. */
	/* (Cache hit rate is typically around 35%.) */
	vma = ACCESS_ONCE(mm->mmap_cache);
	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
		return vma;

	/* Slow path: descend the rbtree, remembering the best candidate so far. */
	vma = NULL;
	for (rb_node = mm->mm_rb.rb_node; rb_node; ) {
		vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

		if (vma_tmp->vm_end <= addr) {
			/* This VMA ends at or below addr: only the right side can match. */
			rb_node = rb_node->rb_right;
			continue;
		}

		/* Candidate found; stop if it contains addr, else look for a lower one. */
		vma = vma_tmp;
		if (vma_tmp->vm_start <= addr)
			break;
		rb_node = rb_node->rb_left;
	}

	if (vma)
		mm->mmap_cache = vma;
	return vma;
}

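find_vma函数寻找第一个满足 addr < vm_area_struct->vm_end 的vma区,返回值有两种情况:
* 返回 NULL:查找一直沿红黑树的右子树向下走到空结点,说明 addr 不小于所有已存在线性区的 vm_end,即 addr 在所有线性区之上。
* 返回非 NULL:要么 addr 落在该vma之内(vm_start <= addr < vm_end,循环通过break跳出);要么 addr 落在该vma之前的空隙中(循环因 rb_node == NULL 而结束),此时返回的是 vm_end 大于 addr 的线性区中地址最低的一个。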
 

如果addr不可用或者addr为空,则对vm_unmapped_area_info进行赋值然后调用vm_unmapped_area函数。然后我们进入到vm_unmapped_area函数中。

/*
 * vm_unmapped_area - dispatch a free-range search according to info->flags.
 *
 * Calls unmapped_area_topdown() when VM_UNMAPPED_AREA_TOPDOWN is set in
 * info->flags and unmapped_area() otherwise, returning the callee's result.
 */
static inline unsigned long
vm_unmapped_area(struct vm_unmapped_area_info *info)
{
	if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
		return unmapped_area_topdown(info);

	return unmapped_area(info);
}

vm_unmapped_area函数根据标志位的不同调用了不同的unmapped_area函数。

他们的区别在于unmapped_area函数完成从低地址向高地址创建新的映射,而unmapped_area_topdown函数完成从高地址向低地址创建新的映射。

我们选择查看unmapped_area函数,其内容如下:

/*
 * unmapped_area - search the VMA rbtree of current->mm, lower addresses
 * first, for a gap that can hold the request described by @info.
 *
 * @info: search request; this function reads length, align_mask,
 *        align_offset, low_limit and high_limit.
 *
 * Returns the start address of the chosen range, adjusted for the
 * requested alignment, or -ENOMEM when no gap qualifies.
 */
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
	/*
	 * We implement the search by looking for an rbtree node that
	 * immediately follows a suitable gap. That is,
	 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
	 * - gap_end   = vma->vm_start        >= info->low_limit  + length;
	 * - gap_end - gap_start >= length
	 */

	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long length, low_limit, high_limit, gap_start, gap_end;

	/* Adjust search length to account for worst case alignment overhead */
	length = info->length + info->align_mask;
	/* Unsigned wrap-around here means the sum overflowed. */
	if (length < info->length)
		return -ENOMEM;

	/* Adjust search limits by the desired length */
	if (info->high_limit < length)
		return -ENOMEM;
	/* From here on, high_limit is the highest acceptable gap START. */
	high_limit = info->high_limit - length;

	if (info->low_limit > high_limit)
		return -ENOMEM;
	/* From here on, low_limit is the lowest acceptable gap END. */
	low_limit = info->low_limit + length;

	/* Check if rbtree root looks promising */
	if (RB_EMPTY_ROOT(&mm->mm_rb))
		goto check_highest;
	vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
	/*
	 * rb_subtree_gap is used as an upper bound on the gaps found in a
	 * subtree: if even the root's value is too small, no gap between
	 * VMAs can fit and only the space above the last VMA remains.
	 */
	if (vma->rb_subtree_gap < length)
		goto check_highest;

	/*
	 * In-order walk: left subtree first (lower addresses), then the gap
	 * in front of the current node, then the right subtree.
	 */
	while (true) {
		/* Visit left subtree if it looks promising */
		gap_end = vma->vm_start;
		if (gap_end >= low_limit && vma->vm_rb.rb_left) {
			struct vm_area_struct *left =
				rb_entry(vma->vm_rb.rb_left,
					 struct vm_area_struct, vm_rb);
			if (left->rb_subtree_gap >= length) {
				vma = left;
				continue;
			}
		}

		/* The gap in front of the first VMA starts at address 0. */
		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
check_current:
		/* Check if current node has a suitable gap */
		/* Later nodes only have higher gaps, so give up past high_limit. */
		if (gap_start > high_limit)
			return -ENOMEM;
		if (gap_end >= low_limit && gap_end - gap_start >= length)
			goto found;

		/* Visit right subtree if it looks promising */
		if (vma->vm_rb.rb_right) {
			struct vm_area_struct *right =
				rb_entry(vma->vm_rb.rb_right,
					 struct vm_area_struct, vm_rb);
			if (right->rb_subtree_gap >= length) {
				vma = right;
				continue;
			}
		}

		/* Go back up the rbtree to find next candidate node */
		/*
		 * Climb until we arrive at a parent from its left child: that
		 * parent is the next node in address order.  Its left subtree
		 * has already been dealt with, so jump straight to
		 * check_current instead of restarting the outer loop.
		 */
		while (true) {
			struct rb_node *prev = &vma->vm_rb;
			if (!rb_parent(prev))
				goto check_highest;
			vma = rb_entry(rb_parent(prev),
				       struct vm_area_struct, vm_rb);
			if (prev == vma->vm_rb.rb_left) {
				/*
				 * NOTE(review): vm_prev is dereferenced without a
				 * NULL check; a node reached from its left child
				 * presumably always has a predecessor — confirm.
				 */
				gap_start = vma->vm_prev->vm_end;
				gap_end = vma->vm_start;
				goto check_current;
			}
		}
	}

check_highest:
	/* Check highest gap, which does not precede any rbtree node */
	gap_start = mm->highest_vm_end;
	gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */
	if (gap_start > high_limit)
		return -ENOMEM;
	/* Otherwise fall through to "found". */

found:
	/* We found a suitable gap. Clip it with the original low_limit. */
	if (gap_start < info->low_limit)
		gap_start = info->low_limit;

	/* Adjust gap address to the desired alignment */
	/*
	 * Adds the smallest offset that makes (gap_start & align_mask) equal
	 * to (align_offset & align_mask); assumes align_mask has the form
	 * 2^k - 1 — TODO confirm against callers.
	 */
	gap_start += (info->align_offset - gap_start) & info->align_mask;

	/* Sanity checks: the final range must still fit the limit and the gap. */
	VM_BUG_ON(gap_start + info->length > info->high_limit);
	VM_BUG_ON(gap_start + info->length > gap_end);
	return gap_start;
}

在unmapped_area函数中首先也定义了下面的数据:

	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long length, low_limit, high_limit, gap_start, gap_end;

mm指向当前进程的内存描述符,并声明了vm_area_struct结构体指针vma,在接下来的代码中做了一些校验和调整。

之后对vma进行了赋值(取红黑树的根结点),然后用vm_area_struct结构体的成员rb_subtree_gap与length进行了比较。

rb_subtree_gap是当前结点与其前驱结点之间空隙 和 当前结点其左右子树中的结点间的最大空隙的最大值。 
unmapped_area函数先检查进程虚拟地址空间中可用于映射的空间边界,不满足要求则返回错误码(-ENOMEM)给上层。当满足时,执行以下操作:在红黑树中按地址从低到高寻找第一个满足这次分配请求的空闲虚拟地址区间,这样也便于两个相邻的vma区合并。

在while循环中 unmapped_area具体步骤如下:

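1. 从vma红黑树的根开始遍历。
2. 若当前结点的vm_start不低于low_limit,且其左孩子的rb_subtree_gap不小于length,则进入左子树并重复本步骤。
3. 否则检测当前结点与其前驱结点之间的空隙:若空隙起点已超过high_limit,直接返回-ENOMEM;若空隙满足分配请求,则跳出循环(found)。
4. 不满足分配请求时,判断其右孩子的rb_subtree_gap是否不小于length,满足则进入右子树,返回到2。
5. 右子树也不满足时,沿父结点向上回退,直到是从某个结点的左子树回退上来的,然后对该结点返回到3。
6. 若一直回退到根结点仍未找到,则检查最高的vma之后的那段空隙(check_highest)。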

总结整个调用过程图示如下所示

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值