【数据结构】【cfs_rq】【task_struct】【sched_domain】
struct cfs_rq
在系统中至少有一个CFS运行队列,其就是根CFS运行队列,而其他的进程组和进程都包含在此运行队列中,不同的是进程组又有它自己的CFS运行队列,其运行队列中包含的是此进程组中的所有进程。当调度器从根CFS运行队列中选择了一个进程组进行调度时,进程组会从自己的CFS运行队列中选择一个调度实体进行调度(这个调度实体可能为进程,也可能又是一个子进程组),就这样一直深入,直到最后选出一个进程进行运行为止。
cfs_rq实际上是rq中与cfs相关的字段
/* CFS-related fields in a runqueue */
struct cfs_rq {
/*
该cfs_rq的load,它只计算它本层下面的se的weight之和,并不是这个se的load,也不是递归到叶子节点上的所有se weight之和(理解这点非常重要)*/struct load_weight load;/*所有进程的累计负荷值*/
//h_nr_running只对于组才有效,包括底层所有cfs_rq的nr_running之和unsigned int nr_running, h_nr_running;nr_running/*当前就绪队列的进程数*/u64 exec_clock;//该cfs_rq总共占用的cpu时间(物理),只累计本层
/** 当前CFS队列上最小运行时间,单调递增* 两种情况下更新该值: * 1、更新当前运行任务的累计运行时间时* 2、当任务从队列删除去,如任务睡眠或退出,这时候会查看剩下的任务的vruntime是否大于min_vruntime,如果是则更新该值。*/
//用于调整se的vruntime,它是递增的,但不一定是该cfs_rq里所有se最小u64 min_vruntime; //该cpu运行队列的vruntime推进值, 一般是红黑树中最小的vruntime值
#ifndef CONFIG_64BITu64 min_vruntime_copy;
#endifstruct rb_root tasks_timeline;/*红黑树的头结点*/struct rb_node *rb_leftmost;/*红黑树的最左面节点*//** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).*/
// current是正在被调用的实体对象
//当前运行的se(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的se)struct sched_entity *curr, *next, *last, *skip;
/** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).* curr: 当前正在运行的sched_entity(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的sched_entity)* next: 表示有些进程急需运行,即使不遵从CFS调度也必须运行它,调度时会检查是否next需要调度,有就调度next** skip: 略过进程(不会选择skip指定的进程调度)*/#ifdef CONFIG_SCHED_DEBUGunsigned int nr_spread_over;
#endif#ifdef CONFIG_SMP/** CFS load tracking*/struct sched_avg avg;u64 runnable_load_sum;unsigned long runnable_load_avg;
#ifdef CONFIG_64BIT_ONLY_CPUunsigned long runnable_load_avg_32bit;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHEDunsigned long tg_load_avg_contrib;unsigned long propagate_avg;
#endifatomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BITu64 load_last_update_time_copy;
#endif#ifdef CONFIG_FAIR_GROUP_SCHED/** h_load = weight * f(tg)** Where f(tg) is the recursive weight fraction assigned to* this group.*/unsigned long h_load;u64 last_h_load_update;struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */#ifdef CONFIG_FAIR_GROUP_SCHED/* 所属于的CPU rq */struct rq *rq; /* cpu runqueue to which this cfs_rq is attached *//** leaf cfs_rqs are those that hold tasks (lowest schedulable entity in* a hierarchy). Non-leaf lrqs hold other higher schedulable entities* (like users, containers etc.)** leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This* list is used during load balance.*/int on_list;struct list_head leaf_cfs_rq_list;
/*属于这个cfs_rq的进程组*/ struct task_group *tg; /* group that "owns" this runqueue */#ifdef CONFIG_SCHED_WALTu64 cumulative_runnable_avg;
#endif#ifdef CONFIG_CFS_BANDWIDTHint runtime_enabled;u64 runtime_expires;s64 runtime_remaining;u64 throttled_clock, throttled_clock_task;u64 throttled_clock_task_time;int throttled, throttle_count, throttle_uptodate;struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
task_struct
每个task对应一个se,但是反过来不一定成立,因为有task_group的概念
struct task_struct
{......../* 表示是否在运行队列 */int on_rq;/* 进程优先级 * prio: 动态优先级,范围为100~139,与静态优先级和补偿(bonus)有关* static_prio: 静态优先级,static_prio = 100 + nice + 20 (nice值为-20~19,所以static_prio值为100~139)* normal_prio: 没有受优先级继承影响的常规优先级,具体见normal_prio函数,跟属于什么类型的进程有关*/int prio, static_prio, normal_prio;/* 实时进程优先级 */unsigned int rt_priority;/* 调度类,调度处理函数类 */const struct sched_class *sched_class;/* 调度实体(红黑树的一个结点) */struct sched_entity se; //通过这个调度实体可以找到对应的task/* 调度实体(实时调度使用) */struct sched_rt_entity rt;struct sched_dl_entity dl;#ifdef CONFIG_CGROUP_SCHED/* 指向其所在进程组 */struct task_group *sched_task_group;
#endif........
}
struct sched_domain
struct sched_domain {/* These fields must be setup */
//调用域可以被别的调用域所包含,parent指向父调用域 struct sched_domain *parent; /* top domain must be null terminated(终止) */struct sched_domain *child; /* bottom domain must be null terminated */// 指向正在均衡的groupstruct sched_group *groups; /* the balancing groups of the domain */
//最小的时间间隔,用于检查进行负载均衡操作的时机是否到了 unsigned long min_interval; /* Minimum balance interval ms */unsigned long max_interval; /* Maximum balance interval ms *///当处理器在不空闲的状态下时,进行负载均衡操作的时间间隔一般也长很多,该factor为其乘数银子 unsigned int busy_factor; /* less balancing by factor if busy *///busy时平衡因子
//判断该调度域是否已经均衡的一个基准值unsigned int imbalance_pct; /* No balance until over watermark(水印) */unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */unsigned int busy_idx;//忙均衡的cpu_load索引unsigned int idle_idx;//空闲均衡的cpu_load索引
//马上就要进入idle的cpu为了尽量不进入idle而进行负载均衡时的cpu_load索引 unsigned int newidle_idx;unsigned int wake_idx;unsigned int forkexec_idx;unsigned int smt_gain;
//进入nohz_idle模式的时候该值为1int nohz_idle; /* NOHZ IDLE status */int flags; /* See SD_* */int level; //domain 所处层次级别/* Runtime fields. */
//domain上次做balance时间unsigned long last_balance; /* init to jiffies. units in jiffies */
//每次balance的间隔时间unsigned int balance_interval; /* initialise to 1. units in ms. *///balance失败次数unsigned int nr_balance_failed; /* initialise to 0 *//* idle_balance() stats */
//这里的max_newidle_lb_cost是指做load balance所花时间。如上面注释所说,max_newidle_lb_cost每个1s衰减1%u64 max_newidle_lb_cost;
//是下一次进行衰减的时间,HZ为jiffies的1s时间unsigned long next_decay_max_lb_cost;#ifdef CONFIG_SCHEDSTATS/* load_balance() stats */unsigned int lb_count[CPU_MAX_IDLE_TYPES];unsigned int lb_failed[CPU_MAX_IDLE_TYPES];unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];unsigned int lb_gained[CPU_MAX_IDLE_TYPES];unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];/* Active load balancing */unsigned int alb_count;unsigned int alb_failed;unsigned int alb_pushed;/* SD_BALANCE_EXEC stats */unsigned int sbe_count;unsigned int sbe_balanced;unsigned int sbe_pushed;/* SD_BALANCE_FORK stats */unsigned int sbf_count;unsigned int sbf_balanced;unsigned int sbf_pushed;/* try_to_wake_up() stats */unsigned int ttwu_wake_remote;unsigned int ttwu_move_affine;unsigned int ttwu_move_balance;struct eas_stats eas_stats;
#endif
#ifdef CONFIG_SCHED_DEBUGchar *name;
#endifunion {void *private; /* used during construction */struct rcu_head rcu; /* used during destruction */};#ifdef CONFIG_INTEL_DWSunsigned int total_groups; /* total group number */unsigned int group_number; /* this CPU's group sequence */unsigned int dws_tf; /* consolidating degree */struct sched_group *first_group; /* ordered by CPU number */
#endifunsigned int span_weight;/** Span of all CPUs in this domain.** NOTE: this field is variable length. (Allocated dynamically* by attaching extra space to the end of the structure,* depending on how many CPUs the kernel has booted up with)*/unsigned long span[0];//当前 domain 中的所有 cpu 位图
};
4、struct sd_lb_stats
/** sd_lb_stats - Structure to store the statistics of a sched_domain* during load balancing.*/
struct sd_lb_stats {struct sched_group *busiest; /* Busiest group in this sd */struct sched_group *local; /* Local group in this sd */unsigned long total_load; /* Total load of all groups in sd */unsigned long total_capacity; /* Total capacity of all groups in sd *////domain中各個group的平均負載unsigned long avg_load; /* Average load across all groups in sd */unsigned long total_util;struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */struct sg_lb_stats local_stat; /* Statistics of the local group */
};
sd做初始化:
static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{/** Skimp(略过) on the clearing(结算) to avoid duplicate(重复的) work. We can avoid clearing* local_stat because update_sg_lb_stats() does a full clear/assignment.* We must however clear busiest_stat::avg_load because* update_sd_pick_busiest() reads this before assignment(分配).*/*sds = (struct sd_lb_stats){.busiest = NULL,.local = NULL,.total_running = 0UL,.total_load = 0UL,.total_capacity = 0UL,.total_util = 0UL,.busiest_stat = {.avg_load = 0UL,.sum_nr_running = 0,.group_type = group_other,},};
}
4、struct sched_group
struct sched_group {
///* Must be a circular list *///环形list sg==sg->next domain内group遍历完成struct sched_group *next; /* Must be a circular list */atomic_t ref;unsigned int group_weight;struct sched_group_capacity *sgc;const struct sched_group_energy *sge;bool overutilized;/** The CPUs this group covers.** NOTE: this field is variable length. (Allocated dynamically* by attaching extra space to the end of the structure,* depending on how many CPUs the kernel has booted up with)*/// 当前group具有哪些cpuunsigned long cpumask[0];
};
5、sg_lb_stats
/** sg_lb_stats - stats of a sched_group required for load_balancing*/
// 在函数update_sg_lb_stats里面更新
struct sg_lb_stats {// sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;// 后续作为判断sg busy的主要依据unsigned long avg_load; /*Avg load across the CPUs of the group */// 每个cpu的load的sumunsigned long group_load; /* Total load over the CPUs of the group *///也是sumunsigned long sum_weighted_load; /* Weighted load of group's tasks *///sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;unsigned long load_per_task;//組中平均每個task的負載
group可容納的task數量,这个不分轻重吗?
//这个存在多个说法// 单个cpu需要考虑rt的影响unsigned long group_capacity; //sg所有cpu capacity的累加// sumunsigned long group_util; /* Total utilization of the group */unsigned int sum_nr_running; /* Nr tasks running in the group */unsigned int idle_cpus;//idle状态的cpu计数// 这个是存疑且不确定的值unsigned int group_weight;// online的cpu的个数
// 严重级别 group_overloaded > group_imbalanced > group_other,后面还多了一个enum group_type group_type;int group_no_capacity;// sgs的capacity已经不够用,赶不上util,所以此时group_overloaded了int group_misfit_task; /* A cpu has a task too big for its capacity */
};
5、struct lb_env
enum fbq_type { regular, remote, all };struct lb_env {struct sched_domain *sd;//所在的sdstruct rq *src_rq;int src_cpu;int dst_cpu;//这里dst_cpu就是需要将task pull到的cpu,目标cpustruct rq *dst_rq;
//由于一些cpu allows的设置,导致一些task不能被迁移到dst_cpu上,
所以在出现这种情况的时候,就需要从dst cpu所在的group上选择另外一个cpustruct cpumask *dst_grpmask;int new_dst_cpu;enum cpu_idle_type idle;//当前cpu是否是idlelong imbalance;//需要迁移的负载,这个是数量还是load值?unsigned int src_grp_nr_running;// 源cpu的task的数量,不一定是最busiest的cpu/* The set of CPUs under consideration for load-balancing */struct cpumask *cpus;unsigned int flags;unsigned int loop;unsigned int loop_break;unsigned int loop_max;//最大迁移的task的数量enum fbq_type fbq_type;enum group_type busiest_group_type;
//初始化链表,后续会将需要迁移的task暂时放在这个链表里面struct list_head tasks;
};
6、struct root_domain
/** We add the notion(概念) of a root-domain which will be used to define per-domain* variables. Each exclusive(单独的) cpuset essentially(本质上) defines an island domain by* fully partitioning(分割的) the member cpus from any other cpuset. Whenever a new* exclusive cpuset is created, we also create and attach a new root-domain* object.**/
struct root_domain {atomic_t refcount;atomic_t rto_count;struct rcu_head rcu;cpumask_var_t span;cpumask_var_t online;/** Indicate pullable load on at least one CPU, e.g:* - More than one runnable task* - Running task is misfit*/int overload;/** The bit corresponding to a CPU gets set here if such CPU has more* than one runnable -deadline task (as it is below for RT tasks).*/cpumask_var_t dlo_mask;atomic_t dlo_count;struct dl_bw dl_bw;struct cpudl cpudl;#ifdef HAVE_RT_PUSH_IPI/** For IPI pull requests, loop across the rto_mask.*/struct irq_work rto_push_work;raw_spinlock_t rto_lock;/* These are only updated and read within rto_lock */int rto_loop;int rto_cpu;/* These atomics are updated outside of a lock */atomic_t rto_loop_next;atomic_t rto_loop_start;
#endif/** The "RT overload" flag: it gets set if a CPU has more than* one runnable RT task.*/cpumask_var_t rto_mask;struct cpupri cpupri;/* Maximum cpu capacity in the system. */struct max_cpu_capacity max_cpu_capacity;/* First cpu with maximum and minimum original capacity */int max_cap_orig_cpu, min_cap_orig_cpu;/* First cpu with middle original capacity */int mid_cap_orig_cpu;
};
6、sched_group_capacity
struct sched_group_capacity {
	atomic_t ref;
	/*
	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
	 * for a single CPU.
	 */
	unsigned long capacity;
	unsigned long max_capacity;	/* Max per-cpu capacity in group */
	unsigned long min_capacity;	/* Min per-CPU capacity in group */
	unsigned long next_update;	/* time of the next update, when update_group_capacity() is called */
	/* NOTE(review): original text suggests 1 = imbalanced, 0 = balanced or cannot balance — confirm. */
	int imbalance;			/* XXX unrelated to capacity but shared group state */
	/*
	 * Number of busy cpus in this group; decremented when a CPU
	 * enters idle.
	 */
	atomic_t nr_busy_cpus;
	unsigned long cpumask[0];	/* iteration mask */
};
7、struct ravg
/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
	/*
	 * 'mark_start' marks the beginning of an event (task waking up, task
	 * starting to execute, task being preempted) within a window.
	 */
	u64 mark_start;
	/*
	 * 'sum' represents how runnable a task has been within the current
	 * window. It incorporates both running time and wait time and is
	 * frequency scaled.
	 *
	 * 'demand' represents the maximum sum seen over the previous
	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive
	 * frequency demand for tasks.
	 */
	u32 sum, demand;
	/*
	 * 'sum_history' keeps track of the history of 'sum' seen over the
	 * previous RAVG_HIST_SIZE windows. Windows where the task was
	 * entirely sleeping are ignored.
	 */
	u32 sum_history[RAVG_HIST_SIZE_MAX];
	/*
	 * 'curr_window' represents the task's contribution to cpu busy time
	 * statistics (rq->curr_runnable_sum) in the current window;
	 * 'prev_window' is the same for the previous window
	 * (rq->prev_runnable_sum).
	 */
	u32 curr_window, prev_window;
	u16 active_windows;
};
【数据结构】【cfs_rq】【task_struct】【sched_domain】相关推荐
- PCB task_struct 数据结构 (转:http://blog.csdn.net/jurrah/article/details/3965437)
在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有效的手段. 当我们调用fork() 时, ...
- task_struct结构
task_struct结构 分类: linux 在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有 ...
- task_struct结构体(PCB)描述
task_struct结构描述 在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有效的手段. 当 ...
- 吐血整理 | 肝翻 Linux 进程调度所有知识点|中奖揭晓
前面我们重点分析了如何通过 fork, vfork, pthread_create 去创建一个进程或者线程,以及后面说了它们共同调用 do_fork 的实现.现在已经知道一个进程是如何创建的,但是进程 ...
- 第一次作业:深入Linux源码分析进程模型
一.进程的概念 第一,进程是一个实体.每一个进程都有它自己的地址空间,一般情况下,包括文本区域(text region).数据区域(data region)和堆栈(stack region).文本区域 ...
- (5)Linux进程调度-CFS调度器
目录 背景 1. 概述 2. 数据结构 2.1 调度类 2.2 rq/cfs_rq/task_struct/task_group/sched_entity 3. 流程分析 3.1 runtime与vr ...
- Linux进程调度 - CFS调度器 LoyenWang
背景 Read the fucking source code! --By 鲁迅 A picture is worth a thousand words. --By 高尔基 说明: Kernel版本: ...
- 《Linux内核设计与实现》读书笔记
文章目录 第1章 Linux内核简介 1.3操作系统和内核简介 单内核和微内核 1.5 Linux内核版本 第2章 从内核出发 2.2 内核源码树 2.4 内核开发的特点 第3章 进程管理 3.1 进 ...
- Linux进程调度-CFS调度器原理分析及实现,懂了
1. 概述 (1) Completely Fair Scheduler,完全公平调度器,用于Linux系统中普通进程的调度. (2) CFS采用了红黑树算法来管理所有的调度实体 sched_entit ...
- Linux进程ID号--Linux进程的管理与调度(三)
进程ID概述 进程ID类型 要想了解内核如何来组织和管理进程ID,先要知道进程ID的类型: 内核中进程ID的类型用pid_type来描述,它被定义在include/linux/pid.h中 enum ...
最新文章
- python.day.10——面向对象(二)
- 中过滤记录中时间_除尘滤芯如何在过滤行业中脱颖而出
- ICS汇编学习笔记——8086的指令系统
- 使用next_permutation()的坑,你中招了么?
- bios是固定在微型计算机上的一块RoM,计算机硬件笔试试题
- 车纷享:基于阿里云HBase构建车联网平台实践
- eclipse aop连接点joinpoint方法点不出来_(面试必备)你必须要懂的Spring-Aop
- ModuleNotFoundError: No module named ‘exceptions‘
- ABP(现代ASP.NET样板开发框架)系列之20、ABP展现层——动态生成WebApi
- JSON cannot be resolved 解决方法
- java实现数据库回滚,java 数据库操作,事宜回滚
- 计算机毕业论文画图软件,计算机科学与技术专业毕业论文(绘图软件的设计).doc...
- 无人驾驶系统基本框架
- 5个小众视频素材网站,你知道吗?
- android 8.0 图标规范,Android 8.0自适应图标
- 计算机如何执行(运行)程序
- halcon学习之路
- 地图标注源码 php,百度地图设置坐标,标注坐标地理位置
- Hutool Java 工具类库Excel导出,配置宽度自适应极度舒适
- TOM游戏h5营销案例分析-高空运鸡蛋