Linux内核中的cgroups v2资源管理技术-CSDN博客

Linux内核中的cgroups v2资源管理技术

什么是cgroups？

cgroups（Control Groups）是Linux内核提供的一种资源管理机制，它允许我们对进程组进行资源限制、优先级设置、资源使用监控等操作。cgroups是容器技术（如Docker、Kubernetes）的核心基础之一，它为容器提供了资源隔离和限制的能力。

cgroups有两个主要版本：

cgroups v1：较早的版本，提供了基本的资源管理功能
cgroups v2：较新的版本，提供了更统一、更灵活的资源管理接口

cgroups v2的设计目标

cgroups v2的设计目标包括：

统一的层次结构：提供一个统一的层次化结构，简化资源管理
更好的资源隔离：更严格的资源隔离，避免资源竞争
更灵活的资源分配：支持动态调整资源分配
更简单的API：简化用户空间接口，降低使用复杂度
更好的安全性：增强安全特性，防止资源逃逸

cgroups v2的核心概念

1. 层次结构

cgroups v2使用一个统一的层次结构来组织进程和资源控制器：

根cgroup：整个系统的根cgroup，所有进程默认属于根cgroup
子cgroup：根cgroup下可以创建多个子cgroup，形成树状结构
进程分配：每个进程只能属于一个cgroup，但可以从一个cgroup迁移到另一个cgroup

2. 资源控制器

cgroups v2支持多种资源控制器，用于管理不同类型的资源：

cpu：控制CPU时间分配
cpuset：控制CPU和内存节点的分配
memory：控制内存使用
io：控制块设备I/O
pids：控制进程数量
hugetlb：控制大页内存使用
rdma：控制RDMA资源使用

3. 挂载和管理

cgroups v2通过文件系统接口进行管理：

挂载cgroup2文件系统：使用mount -t cgroup2 none /sys/fs/cgroup
创建cgroup：在挂载点下创建目录
管理进程：将进程ID写入cgroup.procs文件
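配置资源：先在父cgroup的cgroup.subtree_control中启用所需的控制器（例如 echo "+cpu +memory" > /sys/fs/cgroup/cgroup.subtree_control），子cgroup中才会出现对应的配置文件；然后通过写入这些配置文件来设置资源限制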

cgroups v2的使用方法

1. 基本操作

挂载cgroup2文件系统

# Mount the cgroup2 filesystem
mount -t cgroup2 none /sys/fs/cgroup

# Check the mount
mount | grep cgroup

创建和管理cgroup

# Create a cgroup named myapp
mkdir /sys/fs/cgroup/myapp

# Move the process with PID 1234 into the myapp cgroup
echo 1234 > /sys/fs/cgroup/myapp/cgroup.procs

# List the processes in the myapp cgroup
cat /sys/fs/cgroup/myapp/cgroup.procs

# Remove the myapp cgroup (all processes must be moved out first)
rmdir /sys/fs/cgroup/myapp

2. 资源限制配置

CPU限制

# Set the CPU weight (a relative share; default 100, valid range 1-10000)
echo 500 > /sys/fs/cgroup/myapp/cpu.weight

# cpu.max takes "$MAX $PERIOD", both in microseconds. "max" as the quota
# means no bandwidth limit; here the period is 100000us (100ms).
echo "max 100000" > /sys/fs/cgroup/myapp/cpu.max

# Allow 75000us of CPU time per 100000us period, i.e. 75% of one CPU
echo "75000 100000" > /sys/fs/cgroup/myapp/cpu.max

内存限制

# Set the hard upper limit on memory usage
 echo "256M" > /sys/fs/cgroup/myapp/memory.max

# Set the soft limit (memory.high)
 echo "128M" > /sys/fs/cgroup/myapp/memory.high

# Show the current memory usage
 cat /sys/fs/cgroup/myapp/memory.current

I/O限制

# Set the I/O weight for block device 8:0
echo "8:0 500" > /sys/fs/cgroup/myapp/io.weight

# Limit the read rate on device 8:0 (bytes per second)
echo "8:0 rbps=1048576" > /sys/fs/cgroup/myapp/io.max

# Limit the write rate on device 8:0 (bytes per second)
echo "8:0 wbps=2097152" > /sys/fs/cgroup/myapp/io.max

进程数量限制

# Set the maximum number of processes
echo 100 > /sys/fs/cgroup/myapp/pids.max

# Show the current number of processes
cat /sys/fs/cgroup/myapp/pids.current

cgroups v2的编程接口

1. 系统调用

cgroups v2没有专用的系统调用，而是通过cgroup2文件系统上的常规文件操作来管理，常用的调用包括：

mkdir()/rmdir()：创建和删除cgroup
open()/read()/write()：读取和写入cgroup配置文件
getpid()/getppid()：获取要加入cgroup的进程ID
write()：将进程ID写入cgroup.procs，从而把进程加入cgroup

2. 示例代码

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

// Create the cgroup directory at `path` with mode 0755.
//
// A directory that already exists is treated as success, so the program can
// be re-run without first removing the cgroup left behind by an earlier run.
//
// Returns 0 on success (or when the directory already exists), -1 on failure
// after reporting the reason with perror().
int create_cgroup(const char *path) {
    struct stat st;
    int saved_errno;

    if (mkdir(path, 0755) == 0) {
        return 0;
    }

    // EEXIST is also reported for non-directories, so confirm what is there.
    saved_errno = errno;
    if (saved_errno == EEXIST && stat(path, &st) == 0 && S_ISDIR(st.st_mode)) {
        return 0;
    }

    // stat() may have overwritten errno; report the mkdir() failure.
    errno = saved_errno;
    perror("mkdir");
    return -1;
}

// Move process `pid` into the cgroup at `cgroup_path` by writing its PID to
// <cgroup_path>/cgroup.procs.
//
// Returns 0 on success, -1 on failure; the reason is reported with perror().
int add_process_to_cgroup(const char *cgroup_path, pid_t pid) {
    char procs_path[256];
    char pid_str[32];
    ssize_t written;
    int len;
    int fd;

    len = snprintf(procs_path, sizeof(procs_path), "%s/cgroup.procs", cgroup_path);
    if (len < 0 || (size_t)len >= sizeof(procs_path)) {
        // A truncated path would silently target a different file.
        errno = ENAMETOOLONG;
        perror("cgroup.procs path");
        return -1;
    }

    fd = open(procs_path, O_WRONLY);
    if (fd < 0) {
        perror("open");
        return -1;
    }

    // pid_t has no printf conversion of its own; widen it to long.
    len = snprintf(pid_str, sizeof(pid_str), "%ld", (long)pid);
    written = write(fd, pid_str, (size_t)len);
    if (written != (ssize_t)len) {
        if (written >= 0) {
            // Partial write: errno was not set by write().
            errno = EIO;
        }
        perror("write");
        close(fd);
        return -1;
    }

    close(fd);
    return 0;
}

// Write `text` to the existing file at `path`. `open_msg` and `write_msg`
// are the labels passed to perror() when the respective call fails.
// Returns 0 on success, -1 on failure.
static int put_cpu_setting(const char *path, const char *text,
                           const char *open_msg, const char *write_msg) {
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror(open_msg);
        return -1;
    }

    if (write(fd, text, strlen(text)) < 0) {
        perror(write_msg);
        close(fd);
        return -1;
    }

    close(fd);
    return 0;
}

// Apply CPU settings to the cgroup at `cgroup_path`:
//   - cpu.weight receives `weight`
//   - cpu.max receives "<max_us> 1000000"
// cpu.weight is written first; if that fails, cpu.max is left untouched.
// Returns 0 when both writes succeed, -1 otherwise (reported via perror()).
int set_cpu_limit(const char *cgroup_path, int weight, int max_us) {
    char target[256];
    char value[32];

    // CPU weight
    snprintf(target, sizeof(target), "%s/cpu.weight", cgroup_path);
    snprintf(value, sizeof(value), "%d", weight);
    if (put_cpu_setting(target, value, "open cpu.weight", "write cpu.weight") != 0) {
        return -1;
    }

    // CPU maximum
    snprintf(target, sizeof(target), "%s/cpu.max", cgroup_path);
    snprintf(value, sizeof(value), "%d 1000000", max_us);
    return put_cpu_setting(target, value, "open cpu.max", "write cpu.max");
}

// Write the string `max` (for example "256M") to <cgroup_path>/memory.max.
// Returns 0 on success, -1 if the file cannot be opened or written; the
// failing step is reported with perror().
int set_memory_limit(const char *cgroup_path, const char *max) {
    char target[256];
    int status = -1;
    int fd;

    snprintf(target, sizeof(target), "%s/memory.max", cgroup_path);

    fd = open(target, O_WRONLY);
    if (fd < 0) {
        perror("open memory.max");
        return status;
    }

    if (write(fd, max, strlen(max)) >= 0) {
        status = 0;
    } else {
        perror("write memory.max");
    }

    close(fd);
    return status;
}

// Demo driver: create /sys/fs/cgroup/myapp, move this process into it, apply
// CPU and memory limits, then idle forever so the cgroup can be inspected.
// Exits with status 1 as soon as any step fails.
int main() {
    const char *cgroup_path = "/sys/fs/cgroup/myapp";
    pid_t pid = getpid();
    const char *failure = NULL;

    // Run the steps in order; the first failing step picks the message.
    if (create_cgroup(cgroup_path) < 0) {
        failure = "Failed to create cgroup\n";
    } else if (add_process_to_cgroup(cgroup_path, pid) < 0) {
        failure = "Failed to add process to cgroup\n";
    } else if (set_cpu_limit(cgroup_path, 500, 500000) < 0) {
        failure = "Failed to set CPU limit\n";
    } else if (set_memory_limit(cgroup_path, "256M") < 0) {
        failure = "Failed to set memory limit\n";
    }

    if (failure != NULL) {
        fputs(failure, stderr);
        return 1;
    }

    printf("Process %d added to cgroup %s with resource limits\n", pid, cgroup_path);

    // Keep the process alive.
    for (;;) {
        sleep(1);
    }

    return 0;
}

cgroups v2与cgroups v1的区别

1. 层次结构

cgroups v1：每个资源控制器有独立的层次结构
cgroups v2：所有资源控制器共享一个层次结构

2. 资源分配模型

cgroups v1：使用配额和限制的方式
cgroups v2：统一了资源分配模型，各控制器按权重（weight）、限制（limit）、保护（protection）和分配（allocation）几种方式分配资源

3. 接口设计

cgroups v1：每个资源控制器有独立的文件接口
cgroups v2：提供统一的文件接口，简化管理

4. 安全性

cgroups v1：存在一些安全漏洞，如资源逃逸
cgroups v2：增强了安全特性，提供更严格的资源隔离

5. 兼容性

cgroups v1：被广泛使用，大多数容器运行时支持
cgroups v2：较新，正在被越来越多的系统和容器运行时采用

实际应用场景

1. 容器管理

cgroups v2是现代容器运行时（如Docker、containerd）的基础：

为每个容器提供资源限制
确保容器之间的资源隔离
防止单个容器消耗过多系统资源

2. 服务器资源管理

在多用户或多服务的服务器上：

为不同的服务分配资源份额
限制单个服务的资源使用
确保关键服务的资源需求

3. 边缘设备管理

在资源受限的边缘设备上：

合理分配有限的资源
确保核心功能的资源需求
防止非关键进程消耗过多资源

4. 高性能计算

在高性能计算环境中：

为不同的计算任务分配资源
确保计算任务的资源需求
提高资源利用率

性能优化建议

1. 合理设置资源限制

根据应用的实际需求设置资源限制
避免设置过于严格的限制，导致应用性能下降
定期监控和调整资源限制

2. 优化cgroup层次结构

保持cgroup层次结构简单，避免过深的嵌套
合理组织cgroup，根据应用类型和资源需求进行分组
避免创建过多的cgroup，减少管理开销

3. 监控资源使用

定期监控cgroup的资源使用情况
使用工具如cgroup-monitor、systemd-cgtop等
根据监控结果调整资源分配

4. 结合其他技术

与namespaces结合使用，提供更完整的隔离
与seccomp结合使用，增强安全性
与Linux调度器结合使用，优化CPU分配

代码优化案例

1. 容器资源管理

// container_cgroup.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>   // mkdir()
#include <sys/types.h>  // pid_t
#include <unistd.h>

// Directory under which each container's cgroup is created.
#define CGROUP_BASE "/sys/fs/cgroup"

// Create the directory CGROUP_BASE/<container_id> with mode 0755.
// Returns the result of mkdir(): 0 on success, -1 on failure.
int create_container_cgroup(const char *container_id) {
    char dir[256];
    int rc;

    snprintf(dir, sizeof(dir), "%s/%s", CGROUP_BASE, container_id);
    rc = mkdir(dir, 0755);
    return rc;
}

// Write `value` to CGROUP_BASE/<container_id>/<file_name>.
// Returns 0 on success, -1 on failure after printing the reason to stderr.
static int write_container_setting(const char *container_id,
                                   const char *file_name,
                                   const char *value) {
    char file_path[512];
    size_t len = strlen(value);
    ssize_t written;
    int n;
    int fd;

    n = snprintf(file_path, sizeof(file_path), "%s/%s/%s",
                 CGROUP_BASE, container_id, file_name);
    if (n < 0 || (size_t)n >= sizeof(file_path)) {
        // A truncated path would address a different file.
        fprintf(stderr, "%s: cgroup path too long\n", file_name);
        return -1;
    }

    fd = open(file_path, O_WRONLY);
    if (fd < 0) {
        perror(file_path);
        return -1;
    }

    written = write(fd, value, len);
    if (written < 0) {
        perror(file_path);
    } else if ((size_t)written != len) {
        fprintf(stderr, "%s: short write\n", file_path);
    }
    close(fd);

    return (written >= 0 && (size_t)written == len) ? 0 : -1;
}

// Apply resource limits to the cgroup CGROUP_BASE/<container_id>:
//   cpu.weight <- cpu_weight, memory.max <- memory_max, pids.max <- pids_max.
//
// All three settings are attempted even if an earlier one fails, so one
// failing setting does not prevent the others from being applied.
// Returns 0 if every write succeeded, -1 if any failed (each failure is
// reported on stderr).
int configure_container_resources(const char *container_id, 
                                 const char *cpu_weight, 
                                 const char *memory_max, 
                                 const char *pids_max) {
    int rc = 0;

    // CPU weight
    if (write_container_setting(container_id, "cpu.weight", cpu_weight) < 0) {
        rc = -1;
    }

    // Memory limit
    if (write_container_setting(container_id, "memory.max", memory_max) < 0) {
        rc = -1;
    }

    // Process-count limit
    if (write_container_setting(container_id, "pids.max", pids_max) < 0) {
        rc = -1;
    }

    return rc;
}

// Move process `pid` into the cgroup CGROUP_BASE/<container_id> by writing
// its PID to that cgroup's cgroup.procs file.
//
// Returns 0 on success, -1 on failure; the reason is printed to stderr.
int add_process_to_container(const char *container_id, pid_t pid) {
    char procs_path[512];
    char pid_str[32];
    ssize_t written;
    int len;
    int fd;

    // Build the path in one step and refuse to continue if it was truncated,
    // since a truncated path would address a different file.
    len = snprintf(procs_path, sizeof(procs_path), "%s/%s/cgroup.procs",
                   CGROUP_BASE, container_id);
    if (len < 0 || (size_t)len >= sizeof(procs_path)) {
        fprintf(stderr, "cgroup.procs path too long\n");
        return -1;
    }

    fd = open(procs_path, O_WRONLY);
    if (fd < 0) {
        perror("open");
        return -1;
    }

    // pid_t has no printf conversion of its own; widen it to long.
    len = snprintf(pid_str, sizeof(pid_str), "%ld", (long)pid);
    written = write(fd, pid_str, (size_t)len);
    if (written != (ssize_t)len) {
        if (written < 0) {
            perror("write");
        } else {
            fprintf(stderr, "write: short write\n");
        }
        close(fd);
        return -1;
    }

    close(fd);
    return 0;
}

// Remove the directory CGROUP_BASE/<container_id>.
// Returns the result of rmdir(): 0 on success, -1 on failure.
int cleanup_container_cgroup(const char *container_id) {
    char dir[256];
    int rc;

    snprintf(dir, sizeof(dir), "%s/%s", CGROUP_BASE, container_id);
    rc = rmdir(dir);
    return rc;
}

// Usage: <prog> <container_id> <cpu_weight> <memory_max> <pids_max>
//
// Creates the container's cgroup, applies the requested limits, moves this
// process into the cgroup and then idles forever to simulate a running
// container. If a step after creation fails, the cgroup created here is
// removed again so a failed run does not leave it behind.
// Returns 1 on bad usage or on any failed step.
int main(int argc, char *argv[]) {
    if (argc != 5) {
        fprintf(stderr, "Usage: %s <container_id> <cpu_weight> <memory_max> <pids_max>\n", argv[0]);
        return 1;
    }
    
    const char *container_id = argv[1];
    const char *cpu_weight = argv[2];
    const char *memory_max = argv[3];
    const char *pids_max = argv[4];
    
    // Create the cgroup
    if (create_container_cgroup(container_id) < 0) {
        fprintf(stderr, "Failed to create container cgroup\n");
        return 1;
    }
    
    // Apply the resource limits
    if (configure_container_resources(container_id, cpu_weight, memory_max, pids_max) < 0) {
        fprintf(stderr, "Failed to configure container resources\n");
        // This process has not joined the cgroup yet, so it can be removed.
        cleanup_container_cgroup(container_id);
        return 1;
    }
    
    // Move the current process into the cgroup
    if (add_process_to_container(container_id, getpid()) < 0) {
        fprintf(stderr, "Failed to add process to container\n");
        // Joining failed, so this process is not a member of the cgroup.
        cleanup_container_cgroup(container_id);
        return 1;
    }
    
    printf("Container %s created with resources: cpu_weight=%s, memory_max=%s, pids_max=%s\n", 
           container_id, cpu_weight, memory_max, pids_max);
    
    // Simulate a running container
    while (1) {
        sleep(1);
    }
    
    return 0;
}

2. 服务资源管理

// service_cgroup.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>   // mkdir()
#include <sys/types.h>  // pid_t
#include <unistd.h>

// Parent cgroup under which each service's cgroup is created.
#define CGROUP_BASE "/sys/fs/cgroup/services"

// 初始化服务cgroup层次结构
int init_service_cgroups() {
    return mkdir(CGROUP_BASE, 0755);
}

// Create the directory CGROUP_BASE/<service_name> with mode 0755.
// Returns the result of mkdir(): 0 on success, -1 on failure.
int create_service_cgroup(const char *service_name) {
    char dir[256];
    int rc;

    snprintf(dir, sizeof(dir), "%s/%s", CGROUP_BASE, service_name);
    rc = mkdir(dir, 0755);
    return rc;
}

// Write `value` to CGROUP_BASE/<service_name>/<file_name>.
// Returns 0 on success, -1 on failure after printing the reason to stderr.
static int write_service_setting(const char *service_name,
                                 const char *file_name,
                                 const char *value) {
    char file_path[512];
    size_t len = strlen(value);
    ssize_t written;
    int n;
    int fd;

    n = snprintf(file_path, sizeof(file_path), "%s/%s/%s",
                 CGROUP_BASE, service_name, file_name);
    if (n < 0 || (size_t)n >= sizeof(file_path)) {
        // A truncated path would address a different file.
        fprintf(stderr, "%s: cgroup path too long\n", file_name);
        return -1;
    }

    fd = open(file_path, O_WRONLY);
    if (fd < 0) {
        perror(file_path);
        return -1;
    }

    written = write(fd, value, len);
    if (written < 0) {
        perror(file_path);
    } else if ((size_t)written != len) {
        fprintf(stderr, "%s: short write\n", file_path);
    }
    close(fd);

    return (written >= 0 && (size_t)written == len) ? 0 : -1;
}

// Apply resource settings to the cgroup CGROUP_BASE/<service_name>:
//   cpu.weight <- cpu_weight, memory.max <- memory_max, io.weight <- io_weight.
//
// All three settings are attempted even if an earlier one fails, so one
// failing setting does not prevent the others from being applied.
// Returns 0 if every write succeeded, -1 if any failed (each failure is
// reported on stderr).
int configure_service_resources(const char *service_name, 
                               const char *cpu_weight, 
                               const char *memory_max, 
                               const char *io_weight) {
    int rc = 0;

    // CPU weight
    if (write_service_setting(service_name, "cpu.weight", cpu_weight) < 0) {
        rc = -1;
    }

    // Memory limit
    if (write_service_setting(service_name, "memory.max", memory_max) < 0) {
        rc = -1;
    }

    // I/O weight
    if (write_service_setting(service_name, "io.weight", io_weight) < 0) {
        rc = -1;
    }

    return rc;
}

// Move process `pid` into the cgroup CGROUP_BASE/<service_name> by writing
// its PID to that cgroup's cgroup.procs file.
//
// Returns 0 on success, -1 on failure; the reason is printed to stderr.
int add_service_process(const char *service_name, pid_t pid) {
    char procs_path[512];
    char pid_str[32];
    ssize_t written;
    int len;
    int fd;

    // Build the path in one step and refuse to continue if it was truncated,
    // since a truncated path would address a different file.
    len = snprintf(procs_path, sizeof(procs_path), "%s/%s/cgroup.procs",
                   CGROUP_BASE, service_name);
    if (len < 0 || (size_t)len >= sizeof(procs_path)) {
        fprintf(stderr, "cgroup.procs path too long\n");
        return -1;
    }

    fd = open(procs_path, O_WRONLY);
    if (fd < 0) {
        perror("open");
        return -1;
    }

    // pid_t has no printf conversion of its own; widen it to long.
    len = snprintf(pid_str, sizeof(pid_str), "%ld", (long)pid);
    written = write(fd, pid_str, (size_t)len);
    if (written != (ssize_t)len) {
        if (written < 0) {
            perror("write");
        } else {
            fprintf(stderr, "write: short write\n");
        }
        close(fd);
        return -1;
    }

    close(fd);
    return 0;
}

// Usage: <prog> <service_name> <cpu_weight> <memory_max> <io_weight>
//
// Sets up the services parent cgroup, creates the cgroup for the named
// service, applies the requested settings, moves this process into it and
// then idles forever to simulate a running service.
// Returns 1 on bad usage or when a checked step fails.
int main(int argc, char *argv[]) {
    const char *name;
    const char *cpu;
    const char *mem;
    const char *io;

    if (argc != 5) {
        fprintf(stderr, "Usage: %s <service_name> <cpu_weight> <memory_max> <io_weight>\n", argv[0]);
        return 1;
    }

    name = argv[1];
    cpu = argv[2];
    mem = argv[3];
    io = argv[4];

    // Set up the parent hierarchy; the result is not checked.
    init_service_cgroups();

    // Create the service cgroup
    if (create_service_cgroup(name) < 0) {
        fputs("Failed to create service cgroup\n", stderr);
        return 1;
    }

    // Apply the resource settings
    if (configure_service_resources(name, cpu, mem, io) < 0) {
        fputs("Failed to configure service resources\n", stderr);
        return 1;
    }

    // Move the current process into the cgroup
    if (add_service_process(name, getpid()) < 0) {
        fputs("Failed to add process to service cgroup\n", stderr);
        return 1;
    }

    printf("Service %s created with resources: cpu_weight=%s, memory_max=%s, io_weight=%s\n", 
           name, cpu, mem, io);

    // Simulate a running service
    for (;;) {
        sleep(1);
    }

    return 0;
}