struct cfs_rq

在系统中至少有一个CFS运行队列,即根CFS运行队列,其他的进程组和进程都包含在此运行队列中。不同的是,进程组又有它自己的CFS运行队列,该运行队列中包含的是此进程组中的所有进程。当调度器从根CFS运行队列中选择了一个进程组进行调度时,进程组会从自己的CFS运行队列中选择一个调度实体进行调度(这个调度实体可能是进程,也可能又是一个子进程组),就这样一直深入,直到最后选出一个进程运行为止。

cfs_rq实际上是rq中与cfs相关的字段

/* CFS-related fields in a runqueue */
struct cfs_rq {
/*
该cfs_rq的load,它只计算它本层下面的se的weight之和,并不是这个se的load,也不是递归到叶子节点上的所有se weight之和(理解这点非常重要)*/struct load_weight load;/*所有进程的累计负荷值*/
//h_nr_running只对于组才有效,包括底层所有cfs_rq的nr_running之和unsigned int nr_running, h_nr_running;nr_running/*当前就绪队列的进程数*/u64 exec_clock;//该cfs_rq总共占用的cpu时间(物理),只累计本层
/** 当前CFS队列上最小运行时间,单调递增* 两种情况下更新该值: * 1、更新当前运行任务的累计运行时间时* 2、当任务从队列删除去,如任务睡眠或退出,这时候会查看剩下的任务的vruntime是否大于min_vruntime,如果是则更新该值。*/
//用于调整se的vruntime,它是递增的,但不一定是该cfs_rq里所有se最小u64 min_vruntime; //该cpu运行队列的vruntime推进值, 一般是红黑树中最小的vruntime值
#ifndef CONFIG_64BITu64 min_vruntime_copy;
#endifstruct rb_root tasks_timeline;/*红黑树的头结点*/struct rb_node *rb_leftmost;/*红黑树的最左面节点*//** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).*/
// current是正在被调用的实体对象
//当前运行的se(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的se)struct sched_entity *curr, *next, *last, *skip;
/** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).* curr: 当前正在运行的sched_entity(对于组虽然它不会在cpu上运行,但是当它的下层有一个task在cpu上运行,那么它所在的cfs_rq就把它当做是该cfs_rq上当前正在运行的sched_entity)* next: 表示有些进程急需运行,即使不遵从CFS调度也必须运行它,调度时会检查是否next需要调度,有就调度next** skip: 略过进程(不会选择skip指定的进程调度)*/#ifdef CONFIG_SCHED_DEBUGunsigned int nr_spread_over;
#endif#ifdef CONFIG_SMP/** CFS load tracking*/struct sched_avg avg;u64 runnable_load_sum;unsigned long runnable_load_avg;
#ifdef CONFIG_64BIT_ONLY_CPUunsigned long runnable_load_avg_32bit;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHEDunsigned long tg_load_avg_contrib;unsigned long propagate_avg;
#endifatomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BITu64 load_last_update_time_copy;
#endif#ifdef CONFIG_FAIR_GROUP_SCHED/**   h_load = weight * f(tg)** Where f(tg) is the recursive weight fraction assigned to* this group.*/unsigned long h_load;u64 last_h_load_update;struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */#ifdef CONFIG_FAIR_GROUP_SCHED/* 所属于的CPU rq */struct rq *rq; /* cpu runqueue to which this cfs_rq is attached *//** leaf cfs_rqs are those that hold tasks (lowest schedulable entity in* a hierarchy). Non-leaf lrqs hold other higher schedulable entities* (like users, containers etc.)** leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This* list is used during load balance.*/int on_list;struct list_head leaf_cfs_rq_list;
/*属于这个cfs_rq的进程组*/ struct task_group *tg;   /* group that "owns" this runqueue */#ifdef CONFIG_SCHED_WALTu64 cumulative_runnable_avg;
#endif#ifdef CONFIG_CFS_BANDWIDTHint runtime_enabled;u64 runtime_expires;s64 runtime_remaining;u64 throttled_clock, throttled_clock_task;u64 throttled_clock_task_time;int throttled, throttle_count, throttle_uptodate;struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

task_struct

每个task都对应一个se,但反过来不一定成立:因为有task_group的概念,一个se也可能代表一个进程组。

struct task_struct
{......../* 表示是否在运行队列 */int on_rq;/* 进程优先级 * prio: 动态优先级,范围为100~139,与静态优先级和补偿(bonus)有关* static_prio: 静态优先级,static_prio = 100 + nice + 20 (nice值为-20~19,所以static_prio值为100~139)* normal_prio: 没有受优先级继承影响的常规优先级,具体见normal_prio函数,跟属于什么类型的进程有关*/int prio, static_prio, normal_prio;/* 实时进程优先级 */unsigned int rt_priority;/* 调度类,调度处理函数类 */const struct sched_class *sched_class;/* 调度实体(红黑树的一个结点) */struct sched_entity se; //通过这个调度实体可以找到对应的task/* 调度实体(实时调度使用) */struct sched_rt_entity rt;struct sched_dl_entity dl;#ifdef CONFIG_CGROUP_SCHED/* 指向其所在进程组 */struct task_group *sched_task_group;
#endif........
}

struct sched_domain

struct sched_domain {/* These fields must be setup */
//调用域可以被别的调用域所包含,parent指向父调用域 struct sched_domain *parent;   /* top domain must be null terminated(终止) */struct sched_domain *child;   /* bottom domain must be null terminated */// 指向正在均衡的groupstruct sched_group *groups;   /* the balancing groups of the domain */
//最小的时间间隔,用于检查进行负载均衡操作的时机是否到了 unsigned long min_interval;    /* Minimum balance interval ms */unsigned long max_interval;    /* Maximum balance interval ms *///当处理器在不空闲的状态下时,进行负载均衡操作的时间间隔一般也长很多,该factor为其乘数银子 unsigned int busy_factor;  /* less balancing by factor if busy *///busy时平衡因子
//判断该调度域是否已经均衡的一个基准值unsigned int imbalance_pct; /* No balance until over watermark(水印) */unsigned int cache_nice_tries;   /* Leave cache hot tasks for # tries */unsigned int busy_idx;//忙均衡的cpu_load索引unsigned int idle_idx;//空闲均衡的cpu_load索引
//马上就要进入idle的cpu为了尽量不进入idle而进行负载均衡时的cpu_load索引 unsigned int newidle_idx;unsigned int wake_idx;unsigned int forkexec_idx;unsigned int smt_gain;
//进入nohz_idle模式的时候该值为1int nohz_idle;            /* NOHZ IDLE status */int flags;            /* See SD_* */int level;          //domain 所处层次级别/* Runtime fields. */
//domain上次做balance时间unsigned long last_balance; /* init to jiffies. units in jiffies */
//每次balance的间隔时间unsigned int balance_interval;  /* initialise to 1. units in ms. *///balance失败次数unsigned int nr_balance_failed; /* initialise to 0 *//* idle_balance() stats */
//这里的max_newidle_lb_cost是指做load balance所花时间。如上面注释所说,max_newidle_lb_cost每个1s衰减1%u64 max_newidle_lb_cost;
//是下一次进行衰减的时间,HZ为jiffies的1s时间unsigned long next_decay_max_lb_cost;#ifdef CONFIG_SCHEDSTATS/* load_balance() stats */unsigned int lb_count[CPU_MAX_IDLE_TYPES];unsigned int lb_failed[CPU_MAX_IDLE_TYPES];unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];unsigned int lb_gained[CPU_MAX_IDLE_TYPES];unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];/* Active load balancing */unsigned int alb_count;unsigned int alb_failed;unsigned int alb_pushed;/* SD_BALANCE_EXEC stats */unsigned int sbe_count;unsigned int sbe_balanced;unsigned int sbe_pushed;/* SD_BALANCE_FORK stats */unsigned int sbf_count;unsigned int sbf_balanced;unsigned int sbf_pushed;/* try_to_wake_up() stats */unsigned int ttwu_wake_remote;unsigned int ttwu_move_affine;unsigned int ttwu_move_balance;struct eas_stats eas_stats;
#endif
#ifdef CONFIG_SCHED_DEBUGchar *name;
#endifunion {void *private;     /* used during construction */struct rcu_head rcu;  /* used during destruction */};#ifdef CONFIG_INTEL_DWSunsigned int total_groups;        /* total group number */unsigned int group_number;      /* this CPU's group sequence */unsigned int dws_tf;            /* consolidating degree */struct sched_group *first_group;  /* ordered by CPU number */
#endifunsigned int span_weight;/** Span of all CPUs in this domain.** NOTE: this field is variable length. (Allocated dynamically* by attaching extra space to the end of the structure,* depending on how many CPUs the kernel has booted up with)*/unsigned long span[0];//当前 domain 中的所有 cpu 位图
};

4、struct sd_lb_stats

​
/** sd_lb_stats - Structure to store the statistics of a sched_domain*       during load balancing.*/
struct sd_lb_stats {struct sched_group *busiest;    /* Busiest group in this sd */struct sched_group *local;    /* Local group in this sd */unsigned long total_load;   /* Total load of all groups in sd */unsigned long total_capacity;   /* Total capacity of all groups in sd *////domain中各個group的平均負載unsigned long avg_load;   /* Average load across all groups in sd */unsigned long total_util;struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */struct sg_lb_stats local_stat;  /* Statistics of the local group */
};​

sd_lb_stats 的初始化(init_sd_lb_stats):

static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
{/** Skimp(略过) on the clearing(结算) to avoid duplicate(重复的) work. We can avoid clearing* local_stat because update_sg_lb_stats() does a full clear/assignment.* We must however clear busiest_stat::avg_load because* update_sd_pick_busiest() reads this before assignment(分配).*/*sds = (struct sd_lb_stats){.busiest = NULL,.local = NULL,.total_running = 0UL,.total_load = 0UL,.total_capacity = 0UL,.total_util = 0UL,.busiest_stat = {.avg_load = 0UL,.sum_nr_running = 0,.group_type = group_other,},};
}

4、struct sched_group

​struct sched_group {
///* Must be a circular list *///环形list sg==sg->next  domain内group遍历完成struct sched_group *next;       /* Must be a circular list */atomic_t ref;unsigned int group_weight;struct sched_group_capacity *sgc;const struct sched_group_energy *sge;bool overutilized;/** The CPUs this group covers.** NOTE: this field is variable length. (Allocated dynamically* by attaching extra space to the end of the structure,* depending on how many CPUs the kernel has booted up with)*/// 当前group具有哪些cpuunsigned long cpumask[0];
};

5、sg_lb_stats


/** sg_lb_stats - stats of a sched_group required for load_balancing*/
// 在函数update_sg_lb_stats里面更新
struct sg_lb_stats {// sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;// 后续作为判断sg busy的主要依据unsigned long avg_load; /*Avg load across the CPUs of the group */// 每个cpu的load的sumunsigned long group_load; /* Total load over the CPUs of the group *///也是sumunsigned long sum_weighted_load; /* Weighted load of group's tasks *///sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;unsigned long load_per_task;//組中平均每個task的負載
group可容納的task數量,这个不分轻重吗?
//这个存在多个说法// 单个cpu需要考虑rt的影响unsigned long group_capacity; //sg所有cpu capacity的累加// sumunsigned long group_util; /* Total utilization of the group */unsigned int sum_nr_running; /* Nr tasks running in the group */unsigned int idle_cpus;//idle状态的cpu计数// 这个是存疑且不确定的值unsigned int group_weight;// online的cpu的个数
// 严重级别 group_overloaded > group_imbalanced > group_other,后面还多了一个enum group_type group_type;int group_no_capacity;// sgs的capacity已经不够用,赶不上util,所以此时group_overloaded了int group_misfit_task; /* A cpu has a task too big for its capacity */
};

5、struct lb_env

​enum fbq_type { regular, remote, all };struct lb_env {struct sched_domain   *sd;//所在的sdstruct rq        *src_rq;int         src_cpu;int         dst_cpu;//这里dst_cpu就是需要将task pull到的cpu,目标cpustruct rq       *dst_rq;
//由于一些cpu allows的设置,导致一些task不能被迁移到dst_cpu上,
所以在出现这种情况的时候,就需要从dst cpu所在的group上选择另外一个cpustruct cpumask     *dst_grpmask;int            new_dst_cpu;enum cpu_idle_type  idle;//当前cpu是否是idlelong         imbalance;//需要迁移的负载,这个是数量还是load值?unsigned int     src_grp_nr_running;// 源cpu的task的数量,不一定是最busiest的cpu/* The set of CPUs under consideration for load-balancing */struct cpumask        *cpus;unsigned int      flags;unsigned int      loop;unsigned int       loop_break;unsigned int     loop_max;//最大迁移的task的数量enum fbq_type        fbq_type;enum group_type        busiest_group_type;
//初始化链表,后续会将需要迁移的task暂时放在这个链表里面struct list_head  tasks;
};

6、struct root_domain

/** We add the notion(概念) of a root-domain which will be used to define per-domain* variables. Each exclusive(单独的) cpuset essentially(本质上) defines an island domain by* fully partitioning(分割的) the member cpus from any other cpuset. Whenever a new* exclusive cpuset is created, we also create and attach a new root-domain* object.**/
struct root_domain {atomic_t refcount;atomic_t rto_count;struct rcu_head rcu;cpumask_var_t span;cpumask_var_t online;/** Indicate pullable load on at least one CPU, e.g:* - More than one runnable task* - Running task is misfit*/int overload;/** The bit corresponding to a CPU gets set here if such CPU has more* than one runnable -deadline task (as it is below for RT tasks).*/cpumask_var_t dlo_mask;atomic_t dlo_count;struct dl_bw dl_bw;struct cpudl cpudl;#ifdef HAVE_RT_PUSH_IPI/** For IPI pull requests, loop across the rto_mask.*/struct irq_work rto_push_work;raw_spinlock_t rto_lock;/* These are only updated and read within rto_lock */int rto_loop;int rto_cpu;/* These atomics are updated outside of a lock */atomic_t rto_loop_next;atomic_t rto_loop_start;
#endif/** The "RT overload" flag: it gets set if a CPU has more than* one runnable RT task.*/cpumask_var_t rto_mask;struct cpupri cpupri;/* Maximum cpu capacity in the system. */struct max_cpu_capacity max_cpu_capacity;/* First cpu with maximum and minimum original capacity */int max_cap_orig_cpu, min_cap_orig_cpu;/* First cpu with middle original capacity */int mid_cap_orig_cpu;
};

6、sched_group_capacity


/*
 * struct sched_group_capacity - capacity and shared state of a sched_group.
 * (Field notes are translated from the original annotations.)
 */
struct sched_group_capacity {
	atomic_t ref;
	/*
	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
	 * for a single CPU.
	 */
	unsigned long capacity;
	/* NOTE(review): the notes presume this refers to a per-CPU value. */
	unsigned long max_capacity; /* Max per-cpu capacity in group */
	unsigned long min_capacity; /* Min per-CPU capacity in group */
	/*
	 * Time of the next update; once it is reached update_group_capacity()
	 * is called.
	 */
	unsigned long next_update;
	/*
	 * NOTE(review): the notes ask whether 1 means "imbalanced" and 0 means
	 * "already balanced or cannot be balanced" - confirm against the users.
	 */
	int imbalance; /* XXX unrelated to capacity but shared group state */
	/*
	 * Number of busy cpus in this group.
	 * Decremented when a CPU enters idle.
	 */
	atomic_t nr_busy_cpus;

	unsigned long cpumask[0]; /* iteration mask */
};

7、struct ravg

/* ravg represents frequency scaled cpu-demand of tasks */
struct ravg {
	/*
	 * 'mark_start' marks the beginning of an event (task waking up, task
	 * starting to execute, task being preempted) within a window
	 */
	u64 mark_start;

	/*
	 * 'sum' represents how runnable a task has been within current
	 * window. It incorporates both running time and wait time and is
	 * frequency scaled.  Per the original notes, 0 means not runnable.
	 */
	u32 sum;

	/*
	 * 'demand' represents maximum sum seen over previous
	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
	 * demand for tasks.
	 */
	u32 demand;

	/*
	 * 'sum_history' keeps track of history of 'sum' seen over previous
	 * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
	 * ignored.
	 */
	u32 sum_history[RAVG_HIST_SIZE_MAX];

	/*
	 * 'curr_window' represents task's contribution to cpu busy time
	 * statistics (rq->curr_runnable_sum) in current window
	 */
	u32 curr_window;

	/*
	 * 'prev_window' represents task's contribution to cpu busy time
	 * statistics (rq->prev_runnable_sum) in previous window
	 */
	u32 prev_window;

	u16 active_windows;
};

【数据结构】【cfs_rq】【task_struct】【sched_domain】相关推荐

  1. PCB task_struct 数据结构 (转:http://blog.csdn.net/jurrah/article/details/3965437)

     在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有效的手段. 当我们调用fork() 时, ...

  2. task_struct结构

    task_struct结构 分类: linux 在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有 ...

  3. task_struct结构体(PCB)描述

     task_struct结构描述 在linux 中每一个进程都由task_struct 数据结构来定义. task_struct就是我们通常所说的PCB.她是对进程控制的唯一手段也是最有效的手段. 当 ...

  4. 吐血整理 | 肝翻 Linux 进程调度所有知识点|中奖揭晓

    前面我们重点分析了如何通过 fork, vfork, pthread_create 去创建一个进程或者线程,以及后面说了它们共同调用 do_fork 的实现.现在已经知道一个进程是如何创建的,但是进程 ...

  5. 第一次作业:深入Linux源码分析进程模型

    一.进程的概念 第一,进程是一个实体.每一个进程都有它自己的地址空间,一般情况下,包括文本区域(text region).数据区域(data region)和堆栈(stack region).文本区域 ...

  6. (5)Linux进程调度-CFS调度器

    目录 背景 1. 概述 2. 数据结构 2.1 调度类 2.2 rq/cfs_rq/task_struct/task_group/sched_entity 3. 流程分析 3.1 runtime与vr ...

  7. Linux进程调度 - CFS调度器 LoyenWang

    背景 Read the fucking source code! --By 鲁迅 A picture is worth a thousand words. --By 高尔基 说明: Kernel版本: ...

  8. 《Linux内核设计与实现》读书笔记

    文章目录 第1章 Linux内核简介 1.3操作系统和内核简介 单内核和微内核 1.5 Linux内核版本 第2章 从内核出发 2.2 内核源码树 2.4 内核开发的特点 第3章 进程管理 3.1 进 ...

  9. Linux进程调度-CFS调度器原理分析及实现,懂了

    1. 概述 (1) Completely Fair Scheduler,完全公平调度器,用于Linux系统中普通进程的调度. (2) CFS采用了红黑树算法来管理所有的调度实体 sched_entit ...

  10. Linux进程ID号--Linux进程的管理与调度(三)

    进程ID概述 进程ID类型 要想了解内核如何来组织和管理进程ID,先要知道进程ID的类型: 内核中进程ID的类型用pid_type来描述,它被定义在include/linux/pid.h中 enum ...

最新文章

  1. python.day.10——面向对象(二)
  2. 中过滤记录中时间_除尘滤芯如何在过滤行业中脱颖而出
  3. ICS汇编学习笔记——8086的指令系统
  4. 使用next_permutation()的坑,你中招了么?
  5. bios是固定在微型计算机上的一块RoM,计算机硬件笔试试题
  6. 车纷享:基于阿里云HBase构建车联网平台实践
  7. eclipse aop连接点joinpoint方法点不出来_(面试必备)你必须要懂的Spring-Aop
  8. ModuleNotFoundError: No module named ‘exceptions‘
  9. ABP(现代ASP.NET样板开发框架)系列之20、ABP展现层——动态生成WebApi
  10. JSON cannot be resolved 解决方法
  11. java实现数据库回滚,java 数据库操作,事宜回滚
  12. 计算机毕业论文画图软件,计算机科学与技术专业毕业论文(绘图软件的设计).doc...
  13. 无人驾驶系统基本框架
  14. 5个小众视频素材网站,你知道吗?
  15. android 8.0 图标规范,Android 8.0自适应图标
  16. 计算机如何执行(运行)程序
  17. halcon学习之路
  18. 地图标注源码 php,百度地图设置坐标,标注坐标地理位置
  19. Hutool Java 工具类库Excel导出,配置宽度自适应极度舒适
  20. TOM游戏h5营销案例分析-高空运鸡蛋

热门文章

  1. 十月百度,阿里巴巴,迅雷搜狗最新面试十一题
  2. java导出pdf 含图片_【Java】itext根据模板生成pdf(包括图片和表格)
  3. 在div中加本地html,div加载另一个HTML页面
  4. Pika的设计及实现
  5. 单独备份config配置文件 (来自老梁邮件)
  6. 知识图谱从入门到应用——知识图谱的知识表示:基础知识
  7. python中列表的使用
  8. html rfftq15.gif,STM32F4系列完整固件库
  9. windows server 2012 DHCP
  10. Java 使用 throw 抛出异常