【数据结构】【cfs_rq】【task_struct】【sched_domain】
struct cfs_rq
在系统中至少有一个CFS运行队列,其就是根CFS运行队列,而其他的进程组和进程都包含在此运行队列中,不同的是进程组又有它自己的CFS运行队列,其运行队列中包含的是此进程组中的所有进程。当调度器从根CFS运行队列中选择了一个进程组进行调度时,进程组会从自己的CFS运行队列中选择一个调度实体进行调度(这个调度实体可能为进程,也可能又是一个子进程组),就这样一直深入,直到最后选出一个进程进行运行为止。
cfs_rq实际上是rq中与cfs相关的字段
/* CFS-related fields in a runqueue */
struct cfs_rq {
/*
该cfs_rq的load,它只计算它本层下面的se的weight之和,并不是这个se的load,也不是递归到叶子节点上的所有se weight之和(理解这点非常重要)*/struct load_weight load;/*所有进程的累计负荷值*/
//h_nr_running只对于组才有效,包括底层所有cfs_rq的nr_running之和unsigned int nr_running, h_nr_running;nr_running/*当前就绪队列的进程数*/u64 exec_clock;//该cfs_rq总共占用的cpu时间(物理),只累计本层
/** 当前CFS队列上最小运行时间,单调递增* 两种情况下更新该值: * 1、更新当前运行任务的累计运行时间时* 2、当任务从队列删除去,如任务睡眠或退出,这时候会查看剩下的任务的vruntime是否大于min_vruntime,如果是则更新该值。*/
//用于调整se的vruntime,它是递增的,但不一定是该cfs_rq里所有se最小u64 min_vruntime; //该cpu运行队列的vruntime推进值, 一般是红黑树中最小的vruntime值
#ifndef CONFIG_64BITu64 min_vruntime_copy;
#endifstruct rb_root tasks_timeline;/*红黑树的头结点*/struct rb_node *rb_leftmost;/*红黑树的最左面节点*//** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).*/
// current是正在被调用的实体对象
//当前运行的se(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的se)struct sched_entity *curr, *next, *last, *skip;
/** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).* curr: 当前正在运行的sched_entity(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的sched_entity)* next: 表示有些进程急需运行,即使不遵从CFS调度也必须运行它,调度时会检查是否next需要调度,有就调度next** skip: 略过进程(不会选择skip指定的进程调度)*/#ifdef CONFIG_SCHED_DEBUGunsigned int nr_spread_over;
#endif#ifdef CONFIG_SMP/** CFS load tracking*/struct sched_avg avg;u64 runnable_load_sum;unsigned long runnable_load_avg;
#ifdef CONFIG_64BIT_ONLY_CPUunsigned long runnable_load_avg_32bit;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHEDunsigned long tg_load_avg_contrib;unsigned long propagate_avg;
#endifatomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BITu64 load_last_update_time_copy;
#endif#ifdef CONFIG_FAIR_GROUP_SCHED/** h_load = weight * f(tg)** Where f(tg) is the recursive weight fraction assigned to* this group.*/unsigned long h_load;u64 last_h_load_update;struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */#ifdef CONFIG_FAIR_GROUP_SCHED/* 所属于的CPU rq */struct rq *rq; /* cpu runqueue to which this cfs_rq is attached *//** leaf cfs_rqs are those that hold tasks (lowest schedulable entity in* a hierarchy). Non-leaf lrqs hold other higher schedulable entities* (like users, containers etc.)** leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This* list is used during load balance.*/int on_list;struct list_head leaf_cfs_rq_list;
/*属于这个cfs_rq的进程组*/ struct task_group *tg; /* group that "owns" this runqueue */#ifdef CONFIG_SCHED_WALTu64 cumulative_runnable_avg;
#endif#ifdef CONFIG_CFS_BANDWIDTHint runtime_enabled;u64 runtime_expires;s64 runtime_remaining;u64 throttled_clock, throttled_clock_task;u64 throttled_clock_task_time;int throttled, throttle_count, throttle_uptodate;struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
task_struct
每个task对应一个se,但是反过来不一定成立,因为有task_group的概念
struct task_struct
{......../* 表示是否在运行队列 */int on_rq;/* 进程优先级 * prio: 动态优先级,范围为100~139,与静态优先级和补偿(bonus)有关* static_prio: 静态优先级,static_prio = 100 + nice + 20 (nice值为-20~19,所以static_prio值为100~139)* normal_prio: 没有受优先级继承影响的常规优先级,具体见normal_prio函数,跟属于什么类型的进程有关*/int prio, static_prio, normal_prio;/* 实时进程优先级 */unsigned int rt_priority;/* 调度类,调度处理函数类 */const struct sched_class *sched_class;/* 调度实体(红黑树的一个结点) */struct sched_entity se; //通过这个调度实体可以找到对应的task/* 调度实体(实时调度使用) */struct sched_rt_entity rt;struct sched_dl_entity dl;#ifdef CONFIG_CGROUP_SCHED/* 指向其所在进程组 */struct task_group *sched_task_group;
#endif........
}
struct sched_domain
struct sched_domain {/* These fields must be setup */
//调用域可以被别的调用域所包含,parent指向父调用域 struct sched_domain *parent; /* top domain must be null terminated(终止) */struct sched_domain *child; /* bottom domain must be null terminated */// 指向正在均衡的groupstruct sched_group *groups; /* the balancing groups of the domain */
//最小的时间间隔,用于检查进行负载均衡操作的时机是否到了 unsigned long min_interval; /* Minimum balance interval ms */unsigned long max_interval; /* Maximum balance interval ms *///当处理器在不空闲的状态下时,进行负载均衡操作的时间间隔一般也长很多,该factor为其乘数银子 unsigned int busy_factor; /* less balancing by factor if busy *///busy时平衡因子
//判断该调度域是否已经均衡的一个基准值unsigned int imbalance_pct; /* No balance until over watermark(水印) */unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */unsigned int busy_idx;//忙均衡的cpu_load索引unsigned int idle_idx;//空闲均衡的cpu_load索引
//马上就要进入idle的cpu为了尽量不进入idle而进行负载均衡时的cpu_load索引 unsigned int newidle_idx;unsigned int wake_idx;unsigned int forkexec_idx;unsigned int smt_gain;
//进入nohz_idle模式的时候该值为1int nohz_idle; /* NOHZ IDLE status */int flags; /* See SD_* */int level; //domain 所处层次级别/* Runtime fields. */
//domain上次做balance时间unsigned long last_balance; /* init to jiffies. units in jiffies */
//每次balance的间隔时间unsigned int balance_interval; /* initialise to 1. units in ms. *///balance失败次数unsigned int nr_balance_failed; /* initialise to 0 *//* idle_balance() stats */
//这里的max_newidle_lb_cost是指做load balance所花时间。如上面注释所说,max_newidle_lb_cost每个1s衰减1%u64 max_newidle_lb_cost;
//是下一次进行衰减的时间,HZ为jiffies的1s时间unsigned long next_decay_max_lb_cost;#ifdef CONFIG_SCHEDSTATS/* load_balance() stats */unsigned int lb_count[CPU_MAX_IDLE_TYPES];unsigned int lb_failed[CPU_MAX_IDLE_TYPES];unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];unsigned int lb_gained[CPU_MAX_IDLE_TYPES];unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];/* Active load balancing */unsigned int alb_count;unsigned int alb_failed;unsigned int alb_pushed;/* SD_BALANCE_EXEC stats */unsigned int sbe_count;unsigned int sbe_balanced;unsigned int sbe_pushed;/* SD_BALANCE_FORK stats */unsigned int sbf_count;unsigned int sbf_balanced;unsigned int sbf_pushed;/* try_to_wake_up() stats */unsigned int ttwu_wake_remote;unsigned int ttwu_move_affine;unsigned int ttwu_move_balance;struct eas_stats eas_stats;
#endif
#ifdef CONFIG_SCHED_DEBUGchar *name;
#endifunion {void *private; /* used during construction */struct rcu_head rcu; /* used during destruction */};#ifdef CONFIG_INTEL_DWSunsigned int total_groups; /* total group number */unsigned int group_number; /* this CPU's group sequence */unsigned int dws_tf; /* consolidating degree */struct sched_group *first_group; /* ordered by CPU number */
#endifunsigned int span_weight;/** Span of all CPUs in this domain.** NOTE: this field is variable length. (Allocated dynamically* by attaching extra space to the end of the structure,* depending on how many CPUs the kernel has booted up with)*/unsigned long span[0];//当前 domain 中的所有 cpu 位图
};
4、struct sd_lb_stats
/** sd_lb_stats - Structure to store the statistics of a sched_domain* during load balancing.*/
struct sd_lb_stats {struct sched_group *busiest; /* Busiest group in this sd */struct sched_group *local; /* Local group in this sd */unsigned long total_load; /* Total load of all groups in sd */unsigned long total_capacity; /* Total capacity of all groups in sd *////domain中各個group的平均負載unsigned long avg_load; /* Average load across all groups in sd */unsigned long total_util;struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */struct sg_lb_stats local_stat; /* Statistics of the local group */
};
sd做初始化:
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{/** Skimp(略过) on the clearing(结算) to avoid duplicate(重复的) work. We can avoid clearing* local_stat because update_sg_lb_stats() does a full clear/assignment.* We must however clear busiest_stat::avg_load because* update_sd_pick_busiest() reads this before assignment(分配).*/*sds = (struct sd_lb_stats){.busiest = NULL,.local = NULL,.total_running = 0UL,.total_load = 0UL,.total_capacity = 0UL,.total_util = 0UL,.busiest_stat = {.avg_load = 0UL,.sum_nr_running = 0,.group_type = group_other,},};
}
4、struct sched_group
struct sched_group {
///* Must be a circular list *///环形list sg==sg->next domain内group遍历完成struct sched_group *next; /* Must be a circular list */atomic_t ref;unsigned int group_weight;struct sched_group_capacity *sgc;const struct sched_group_energy *sge;bool overutilized;/** The CPUs this group covers.** NOTE: this field is variable length. (Allocated dynamically* by attaching extra space to the end of the structure,* depending on how many CPUs the kernel has booted up with)*/// 当前group具有哪些cpuunsigned long cpumask[0];
};
5、sg_lb_stats
/** sg_lb_stats - stats of a sched_group required for load_balancing*/
// 在函数update_sg_lb_stats里面更新
struct sg_lb_stats {// sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;// 后续作为判断sg busy的主要依据unsigned long avg_load; /*Avg load across the CPUs of the group */// 每个cpu的load的sumunsigned long group_load; /* Total load over the CPUs of the group *///也是sumunsigned long sum_weighted_load; /* Weighted load of group's tasks *///sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;unsigned long load_per_task;//組中平均每個task的負載
group可容納的task數量,这个不分轻重吗?
//这个存在多个说法// 单个cpu需要考虑rt的影响unsigned long group_capacity; //sg所有cpu capacity的累加// sumunsigned long group_util; /* Total utilization of the group */unsigned int sum_nr_running; /* Nr tasks running in the group */unsigned int idle_cpus;//idle状态的cpu计数// 这个是存疑且不确定的值unsigned int group_weight;// online的cpu的个数
// 严重级别 group_overloaded > group_imbalanced > group_other,后面还多了一个enum group_type group_type;int group_no_capacity;// sgs的capacity已经不够用,赶不上util,所以此时group_overloaded了int group_misfit_task; /* A cpu has a task too big for its capacity */
};
5、struct lb_env
enum fbq_type { regular, remote, all };struct lb_env {struct sched_domain *sd;//所在的sdstruct rq *src_rq;int src_cpu;int dst_cpu;//这里dst_cpu就是需要将task pull到的cpu,目标cpustruct rq *dst_rq;
//由于一些cpu allows的设置,导致一些task不能被迁移到dst_cpu上,
所以在出现这种情况的时候,就需要从dst cpu所在的group上选择另外一个cpustruct cpumask *dst_grpmask;int new_dst_cpu;enum cpu_idle_type idle;//当前cpu是否是idlelong imbalance;//需要迁移的负载,这个是数量还是load值?unsigned int src_grp_nr_running;// 源cpu的task的数量,不一定是最busiest的cpu/* The set of CPUs under consideration for load-balancing */struct cpumask *cpus;unsigned int flags;unsigned int loop;unsigned int loop_break;unsigned int loop_max;//最大迁移的task的数量enum fbq_type fbq_type;enum group_type busiest_group_type;
//初始化链表,后续会将需要迁移的task暂时放在这个链表里面struct list_head tasks;
};
6、struct root_domain
/** We add the notion(概念) of a root-domain which will be used to define per-domain* variables. Each exclusive(单独的) cpuset essentially(本质上) defines an island domain by* fully partitioning(分割的) the member cpus from any other cpuset. Whenever a new* exclusive cpuset is created, we also create and attach a new root-domain* object.**/
struct root_domain {atomic_t refcount;atomic_t rto_count;struct rcu_head rcu;cpumask_var_t span;cpumask_var_t online;/** Indicate pullable load on at least one CPU, e.g:* - More than one runnable task* - Running task is misfit*/int overload;/** The bit corresponding to a CPU gets set here if such CPU has more* than one runnable -deadline task (as it is below for RT tasks).*/cpumask_var_t dlo_mask;atomic_t dlo_count;struct dl_bw dl_bw;struct cpudl cpudl;#ifdef HAVE_RT_PUSH_IPI/** For IPI pull requests, loop across the rto_mask.*/struct irq_work rto_push_work;raw_spinlock_t rto_lock;/* These are only updated and read within rto_lock */int rto_loop;int rto_cpu;/* These atomics are updated outside of a lock */atomic_t rto_loop_next;atomic_t rto_loop_start;
#endif/** The "RT overload" flag: it gets set if a CPU has more than* one runnable RT task.*/cpumask_var_t rto_mask;struct cpupri cpupri;/* Maximum cpu capacity in the system. */struct max_cpu_capacity max_cpu_capacity;/* First cpu with maximum and minimum original capacity */int max_cap_orig_cpu, min_cap_orig_cpu;/* First cpu with middle original capacity */int mid_cap_orig_cpu;
};
6、sched_group_capacity
struct sched_group_capacity {
	atomic_t ref;
	/*
	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
	 * for a single CPU.
	 */
	unsigned long capacity;
	unsigned long max_capacity;	/* Max per-cpu capacity in group */
	unsigned long min_capacity;	/* Min per-CPU capacity in group */
	unsigned long next_update;	/* time of the next update, when update_group_capacity() is called */
	/* NOTE(review): original text suggests 1 = imbalanced, 0 = balanced or cannot balance — confirm. */
	int imbalance;			/* XXX unrelated to capacity but shared group state */
	/*
	 * Number of busy cpus in this group; decremented when a CPU
	 * enters idle.
	 */
	atomic_t nr_busy_cpus;
	unsigned long cpumask[0];	/* iteration mask */
};
7、struct ravg
/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
	/*
	 * 'mark_start' marks the beginning of an event (task waking up, task
	 * starting to execute, task being preempted) within a window.
	 */
	u64 mark_start;
	/*
	 * 'sum' represents how runnable a task has been within the current
	 * window. It incorporates both running time and wait time and is
	 * frequency scaled.
	 *
	 * 'demand' represents the maximum sum seen over the previous
	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive
	 * frequency demand for tasks.
	 */
	u32 sum, demand;
	/*
	 * 'sum_history' keeps track of the history of 'sum' seen over the
	 * previous RAVG_HIST_SIZE windows. Windows where the task was
	 * entirely sleeping are ignored.
	 */
	u32 sum_history[RAVG_HIST_SIZE_MAX];
	/*
	 * 'curr_window' represents the task's contribution to cpu busy time
	 * statistics (rq->curr_runnable_sum) in the current window;
	 * 'prev_window' is the same for the previous window
	 * (rq->prev_runnable_sum).
	 */
	u32 curr_window, prev_window;
	u16 active_windows;
};
【数据结构】【cfs_rq】【task_struct】【sched_domain】相关推荐
- PCB task_struct 数据结构 (转:http://blog.csdn.net/jurrah/article/details/3965437)
在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有效的手段. 当我们调用fork() 时, ...
- task_struct结构
task_struct结构 分类: linux 在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有 ...
- task_struct结构体(PCB)描述
task_struct结构描述 在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有效的手段. 当 ...
- 吐血整理 | 肝翻 Linux 进程调度所有知识点|中奖揭晓
前面我们重点分析了如何通过 fork, vfork, pthread_create 去创建一个进程或者线程,以及后面说了它们共同调用 do_fork 的实现.现在已经知道一个进程是如何创建的,但是进程 ...
- 第一次作业:深入Linux源码分析进程模型
一.进程的概念 第一,进程是一个实体.每一个进程都有它自己的地址空间,一般情况下,包括文本区域(text region).数据区域(data region)和堆栈(stack region).文本区域 ...
- (5)Linux进程调度-CFS调度器
目录 背景 1. 概述 2. 数据结构 2.1 调度类 2.2 rq/cfs_rq/task_struct/task_group/sched_entity 3. 流程分析 3.1 runtime与vr ...
- Linux进程调度 - CFS调度器 LoyenWang
背景 Read the fucking source code! --By 鲁迅 A picture is worth a thousand words. --By 高尔基 说明: Kernel版本: ...
- 《Linux内核设计与实现》读书笔记
文章目录 第1章 Linux内核简介 1.3操作系统和内核简介 单内核和微内核 1.5 Linux内核版本 第2章 从内核出发 2.2 内核源码树 2.4 内核开发的特点 第3章 进程管理 3.1 进 ...
- Linux进程调度-CFS调度器原理分析及实现,懂了
1. 概述 (1) Completely Fair Scheduler,完全公平调度器,用于Linux系统中普通进程的调度. (2) CFS采用了红黑树算法来管理所有的调度实体 sched_entit ...
- Linux进程ID号--Linux进程的管理与调度(三)
进程ID概述 进程ID类型 要想了解内核如何来组织和管理进程ID,先要知道进程ID的类型: 内核中进程ID的类型用pid_type来描述,它被定义在include/linux/pid.h中 enum ...
最新文章
- python.day.10——面向对象(二)
- 中过滤记录中时间_除尘滤芯如何在过滤行业中脱颖而出
- ICS汇编学习笔记——8086的指令系统
- 使用next_permutation()的坑,你中招了么?
- bios是固定在微型计算机上的一块RoM,计算机硬件笔试试题
- 车纷享:基于阿里云HBase构建车联网平台实践
- eclipse aop连接点joinpoint方法点不出来_(面试必备)你必须要懂的Spring-Aop
- ModuleNotFoundError: No module named ‘exceptions‘
- ABP(现代ASP.NET样板开发框架)系列之20、ABP展现层——动态生成WebApi
- JSON cannot be resolved 解决方法
- java实现数据库回滚,java 数据库操作,事宜回滚
- 计算机毕业论文画图软件,计算机科学与技术专业毕业论文(绘图软件的设计).doc...
- 无人驾驶系统基本框架
- 5个小众视频素材网站,你知道吗?
- android 8.0 图标规范,Android 8.0自适应图标
- 计算机如何执行(运行)程序
- halcon学习之路
- 地图标注源码 php,百度地图设置坐标,标注坐标地理位置
- Hutool Java 工具类库Excel导出,配置宽度自适应极度舒适
- TOM游戏h5营销案例分析-高空运鸡蛋