4. Load Balancing

4.1. SMP Load Balancing

4.1.1. Scheduling Domains

4.1.1.1. The Scheduling Domains Concept

Borrowing the description from the Linux Scheduling Domains documentation, the concept can be explained as follows.

From top to bottom, a complex high-end system can be composed like this:

  • 1. It is a NUMA system: each node in the system accesses different regions of memory at different speeds.
  • 2. It is also an SMP system, built from several physical CPUs (physical packages). The physical CPUs share all of the system's memory, but each has its own independent cache.
  • 3. Each physical CPU consists of multiple cores (multi-core, also called chip-level multiprocessing, CMP). The cores are integrated on one die; each core usually has its own L1 cache but may share the L2 cache.
  • 4. Each core can implement several hardware threads, or virtual CPUs, through a technique such as SMT (for example Intel Hyper-Threading). Logically, each hardware thread looks like a CPU, but the threads share almost everything, including the L1 cache and even the ALUs and power budget.

As the list shows, CPUs are organized in multiple levels, and the closer two CPUs are in the hierarchy, the more resources they share. Migrating a task between CPUs therefore has a cost: from a performance point of view, the more levels the migration crosses, the larger the loss. The cost of migration must also be considered from a power point of view, which is what EAS addresses.

4.1.1.2. arm64 cpu_topology

On arm64 the CPU topology is stored in the cpu_topology[] array:

/* cpu topology table */
struct cpu_topology cpu_topology[NR_CPUS];

struct cpu_topology {
    int thread_id;
    int core_id;
    int cluster_id;                 /* cluster this cpu belongs to */
    unsigned int partno;
    cpumask_t thread_sibling;
    cpumask_t core_sibling;         /* sibling cpus at the MC level (same cluster) */
};

cpu_topology[] is populated by parse_dt_topology(), which parses the topology information in the dts:

kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> init_cpu_topology() -> parse_dt_topology()↓static int __init parse_dt_topology(void)
{struct device_node *cn, *map;int ret = 0;int cpu;/* (1) 找到dts中cpu topology的根节点"/cpus"" */cn = of_find_node_by_path("/cpus");if (!cn) {pr_err("No CPU information found in DT\n");return 0;}/** When topology is provided cpu-map is essentially a root* cluster with restricted subnodes.*//* (2) 找到"cpu-map"节点 */map = of_get_child_by_name(cn, "cpu-map");if (!map)goto out;/* (3) 解析"cpu-map"中的cluster */ret = parse_cluster(map, 0);if (ret != 0)goto out_map;/** Check that all cores are in the topology; the SMP code will* only mark cores described in the DT as possible.*/for_each_possible_cpu(cpu)if (cpu_topology[cpu].cluster_id == -1)ret = -EINVAL;out_map:of_node_put(map);
out:of_node_put(cn);return ret;
}|→static int __init parse_cluster(struct device_node *cluster, int depth)
{char name[10];bool leaf = true;bool has_cores = false;struct device_node *c;static int cluster_id __initdata;int core_id = 0;int i, ret;/** First check for child clusters; we currently ignore any* information about the nesting of clusters and present the* scheduler with a flat list of them.*/i = 0;/* (3.1) 如果有多级cluster,继续递归搜索 */do {snprintf(name, sizeof(name), "cluster%d", i);c = of_get_child_by_name(cluster, name);if (c) {leaf = false;ret = parse_cluster(c, depth + 1);of_node_put(c);if (ret != 0)return ret;}i++;} while (c);/* Now check for cores */i = 0;do {/* (3.2) 或者core层次的节点 */snprintf(name, sizeof(name), "core%d", i);c = of_get_child_by_name(cluster, name);if (c) {has_cores = true;if (depth == 0) {pr_err("%s: cpu-map children should be clusters\n",c->full_name);of_node_put(c);return -EINVAL;}if (leaf) {/* (3.3) 如果是叶子cluster节点,继续遍历core中的cpu节点 */ret = parse_core(c, cluster_id, core_id++);} else {pr_err("%s: Non-leaf cluster with core %s\n",cluster->full_name, name);ret = -EINVAL;}of_node_put(c);if (ret != 0)return ret;}i++;} while (c);if (leaf && !has_cores)pr_warn("%s: empty cluster\n", cluster->full_name);if (leaf)cluster_id++;return 0;
}||→static int __init parse_core(struct device_node *core, int cluster_id,int core_id)
{char name[10];bool leaf = true;int i = 0;int cpu;struct device_node *t;do {/* (3.3.1) 如果存在thread层级,解析thread和cpu层级 */snprintf(name, sizeof(name), "thread%d", i);t = of_get_child_by_name(core, name);if (t) {leaf = false;cpu = get_cpu_for_node(t);if (cpu >= 0) {cpu_topology[cpu].cluster_id = cluster_id;cpu_topology[cpu].core_id = core_id;cpu_topology[cpu].thread_id = i;} else {pr_err("%s: Can't get CPU for thread\n",t->full_name);of_node_put(t);return -EINVAL;}of_node_put(t);}i++;} while (t);/* (3.3.2) 否则直接解析cpu层级 */cpu = get_cpu_for_node(core);if (cpu >= 0) {if (!leaf) {pr_err("%s: Core has both threads and CPU\n",core->full_name);return -EINVAL;}/* (3.3.3) 得到了cpu的cluster_id/core_id */cpu_topology[cpu].cluster_id = cluster_id;cpu_topology[cpu].core_id = core_id;} else if (leaf) {pr_err("%s: Can't get CPU for leaf core\n", core->full_name);return -EINVAL;}return 0;
}|||→static int __init get_cpu_for_node(struct device_node *node)
{struct device_node *cpu_node;int cpu;cpu_node = of_parse_phandle(node, "cpu", 0);if (!cpu_node)return -1;for_each_possible_cpu(cpu) {if (of_get_cpu_node(cpu, NULL) == cpu_node) {of_node_put(cpu_node);return cpu;}}pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name);of_node_put(cpu_node);return -1;
}

The same-level sibling relationships, cpu_topology[cpu].core_sibling/thread_sibling, are updated in update_siblings_masks():

kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> store_cpu_topology() -> update_siblings_masks()
↓
static void update_siblings_masks(unsigned int cpuid)
{
    struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
    int cpu;

    /* update core and thread sibling masks */
    for_each_possible_cpu(cpu) {
        cpu_topo = &cpu_topology[cpu];

        if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
            continue;

        cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
        if (cpu != cpuid)
            cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);

        if (cpuid_topo->core_id != cpu_topo->core_id)
            continue;

        cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
        if (cpu != cpuid)
            cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
    }
}

Take mt6799 as an example: its topology is "4*A35 + 4*A53 + 2*A73", defined in the dts as follows:

mt6799.dtsi:cpus {#address-cells = <1>;#size-cells = <0>;cpu0: cpu@0 {device_type = "cpu";compatible = "arm,cortex-a35";reg = <0x000>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1248000000>;};cpu1: cpu@001 {device_type = "cpu";compatible = "arm,cortex-a35";reg = <0x001>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1248000000>;};cpu2: cpu@002 {device_type = "cpu";compatible = "arm,cortex-a35";reg = <0x002>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1248000000>;};cpu3: cpu@003 {device_type = "cpu";compatible = "arm,cortex-a35";reg = <0x003>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1248000000>;};cpu4: cpu@100 {device_type = "cpu";compatible = "arm,cortex-a53";reg = <0x100>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1378000000>;};cpu5: cpu@101 {device_type = "cpu";compatible = "arm,cortex-a53";reg = <0x101>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1378000000>;};cpu6: cpu@102 {device_type = "cpu";compatible = "arm,cortex-a53";reg = <0x102>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1378000000>;};cpu7: cpu@103 {device_type = "cpu";compatible = "arm,cortex-a53";reg = <0x103>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1378000000>;};cpu8: cpu@200 {device_type = "cpu";compatible = "arm,cortex-a73";reg = <0x200>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1638000000>;};cpu9: cpu@201 {device_type = "cpu";compatible = "arm,cortex-a73";reg = <0x201>;enable-method = "psci";cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;cpu-release-addr = <0x0 0x40000200>;clock-frequency = <1638000000>;};cpu-map {cluster0 {core0 {cpu = <&cpu0>;};core1 {cpu = <&cpu1>;};core2 {cpu = <&cpu2>;};core3 {cpu = <&cpu3>;};};cluster1 {core0 {cpu = <&cpu4>;};core1 {cpu = <&cpu5>;};core2 {cpu = <&cpu6>;};core3 {cpu = <&cpu7>;};};cluster2 {core0 {cpu = <&cpu8>;};core1 {cpu = <&cpu9>;};};};
  • After parse_dt_topology() and update_siblings_masks() have run, cpu_topology[] contains the values below (see the small userspace check after the list):
cpu 0 cluster_id = 0, core_id = 0, core_sibling = 0xf
cpu 1 cluster_id = 0, core_id = 1, core_sibling = 0xf
cpu 2 cluster_id = 0, core_id = 2, core_sibling = 0xf
cpu 3 cluster_id = 0, core_id = 3, core_sibling = 0xf
cpu 4 cluster_id = 1, core_id = 0, core_sibling = 0xf0
cpu 5 cluster_id = 1, core_id = 1, core_sibling = 0xf0
cpu 6 cluster_id = 1, core_id = 2, core_sibling = 0xf0
cpu 7 cluster_id = 1, core_id = 3, core_sibling = 0xf0
cpu 8 cluster_id = 2, core_id = 0, core_sibling = 0x300
cpu 9 cluster_id = 2, core_id = 1, core_sibling = 0x300
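These masks are also visible from userspace. Below is a minimal check (an illustration only, assuming the standard sysfs topology files are present; cpu8/cpu9 only show up once they are brought online) that prints the core_siblings masks listed above:

#include <stdio.h>

int main(void)
{
    char path[128], buf[64];
    int cpu;

    for (cpu = 0; cpu < 10; cpu++) {
        FILE *fp;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/topology/core_siblings", cpu);
        fp = fopen(path, "r");
        if (!fp)
            continue;               /* cpu not present/online */
        if (fgets(buf, sizeof(buf), fp))
            printf("cpu%d core_siblings = %s", cpu, buf);
        fclose(fp);
    }
    return 0;
}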

4.1.1.3. Scheduling Domains Initialization

In kernel_init_freeable(), after smp_prepare_cpus() has initialized the CPU topology and smp_init() has brought up the CPUs, sched_init_smp() is called to initialize the system's scheduling domains.

By default the topology offers three levels: SMT/MC/DIE. Since ARM does not use hardware multithreading here, only two levels are supported: MC and DIE.

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
    { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
    { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};

The SDTL used by arm64 is:

static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
    { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
    { NULL, },
};
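The mask callbacks in the SDTL decide which cpus each level spans. Simplified from the arm64/generic kernel sources of this era, they map straight back to the cpu_topology[] table built above (the DIE level falls back to the node mask, which covers all cpus on a non-NUMA SoC like this one):

const struct cpumask *cpu_coregroup_mask(int cpu)
{
    /* MC level: the cpus sharing this cpu's cluster_id */
    return &cpu_topology[cpu].core_sibling;
}

static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
    /* DIE level: all cpus of this cpu's node */
    return cpumask_of_node(cpu_to_node(cpu));
}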

The scheduling-domain initialization code is analyzed below:

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains(cpu_active_mask):↓static int init_sched_domains(const struct cpumask *cpu_map)
{int err;arch_update_cpu_topology();/* (1) 当前只有一个schedule domain需要初始化 */ndoms_cur = 1;doms_cur = alloc_sched_domains(ndoms_cur);if (!doms_cur)doms_cur = &fallback_doms;/* (2) 按照传入的cpu_active_mask,构造sched_domains */cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);err = build_sched_domains(doms_cur[0], NULL);/* (3) 注册“/proc/sys/kernel/sched_domain/” */register_sched_domain_sysctl();return err;
}|→static int build_sched_domains(const struct cpumask *cpu_map,struct sched_domain_attr *attr)
{enum s_alloc alloc_state;struct sched_domain *sd;struct s_data d;struct rq *rq = NULL;int i, ret = -ENOMEM;/* (2.1) 在每个tl层次,给每个cpu分配sd、sg、sgc空间 */alloc_state = __visit_domain_allocation_hell(&d, cpu_map);if (alloc_state != sa_rootdomain)goto error;/* Set up domains for cpus specified by the cpu_map. */for_each_cpu(i, cpu_map) {struct sched_domain_topology_level *tl;sd = NULL;for_each_sd_topology(tl) {/* (2.2) 初始化sd构造其不同tl之间的sd的parent、cild关系按照SDTL传入的tl->mask()函数,给sd->span[]赋值*/sd = build_sched_domain(tl, cpu_map, attr, sd, i);/* (2.2.1) 将最底层tl的sd赋值给d.sd */if (tl == sched_domain_topology)*per_cpu_ptr(d.sd, i) = sd;if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))sd->flags |= SD_OVERLAP;if (cpumask_equal(cpu_map, sched_domain_span(sd)))break;}}/* Build the groups for the domains */for_each_cpu(i, cpu_map) {for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {/* (2.3) 给sd->span_weight赋值 */sd->span_weight = cpumask_weight(sched_domain_span(sd));if (sd->flags & SD_OVERLAP) {if (build_overlap_sched_groups(sd, i))goto error;} else {/* (2.4) 按照span,构造每个tl层次中,sd、sg之间的关系 */if (build_sched_groups(sd, i))goto error;}}}/* Calculate CPU capacity for physical packages and nodes */for (i = nr_cpumask_bits-1; i >= 0; i--) {struct sched_domain_topology_level *tl = sched_domain_topology;if (!cpumask_test_cpu(i, cpu_map))continue;for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {/* (2.5) 初始化sg->sge对应的energy表 */init_sched_energy(i, sd, tl->energy);/* (2.6) 对有人引用的sd、sg、sgc进行标识,无人引用的sd、sg、sgc在__free_domain_allocs()中会被释放*/claim_allocations(i, sd);/* (2.7) 初始化每个tl层级的sgc->capacity*/init_sched_groups_capacity(i, sd);}}/* Attach the domains */rcu_read_lock();/* (2.8) 将d.rd赋值给rq->sd将d.rd赋值给rq->rd*/for_each_cpu(i, cpu_map) {rq = cpu_rq(i);sd = *per_cpu_ptr(d.sd, i);cpu_attach_domain(sd, d.rd, i);}rcu_read_unlock();ret = 0;
error:/* (2.9) free掉分配失败/分配成功多余的内存 */__free_domain_allocs(&d, alloc_state, cpu_map);return ret;
}||→static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,const struct cpumask *cpu_map)
{memset(d, 0, sizeof(*d));/* (2.1.1) 每个tl层次,给每个cpu都分配sd、sg、sgc,tl->data->sd、l->data->sg、l->data->sgc*/if (__sdt_alloc(cpu_map))return sa_sd_storage;/* (2.1.2) 分配d->sd指针空间实际d->sd会指向最底层tl的tl->data->sd*/d->sd = alloc_percpu(struct sched_domain *);if (!d->sd)return sa_sd_storage;/* (2.1.3) 分配d->rd的指针空间和实际空间 rd = root_domain*/d->rd = alloc_rootdomain();if (!d->rd)return sa_sd;return sa_rootdomain;
}||→struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,const struct cpumask *cpu_map, struct sched_domain_attr *attr,struct sched_domain *child, int cpu)
{struct sched_domain *sd = sd_init(tl, cpu);if (!sd)return child;/* (2.2.1) 根据tl->mask()初始化sd->sapn[] */cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));if (child) {sd->level = child->level + 1;sched_domain_level_max = max(sched_domain_level_max, sd->level);/* (2.2.2) 如果有多层tl,建立起sd之间的parent/child关系,对arm来说:MC层tl->data->sd是child,DIE层tl->data->sd是parent*/child->parent = sd;sd->child = child;if (!cpumask_subset(sched_domain_span(child),sched_domain_span(sd))) {pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUGpr_err("     the %s domain not a subset of the %s domain\n",child->name, sd->name);
#endif/* Fixup, ensure @sd has at least @child cpus. */cpumask_or(sched_domain_span(sd),sched_domain_span(sd),sched_domain_span(child));}}set_domain_attribute(sd, attr);return sd;
}||→static int
build_sched_groups(struct sched_domain *sd, int cpu)
{struct sched_group *first = NULL, *last = NULL;struct sd_data *sdd = sd->private;const struct cpumask *span = sched_domain_span(sd);struct cpumask *covered;int i;/* (2.4.1) 根据sd->span[]建立起sd、sg之间的关系 ,如果sd没有child,每个cpu的sd、sg之间建立链接如果sd有child,每个cpu的sd和span中第一个cpu的sg建立链接*/get_group(cpu, sdd, &sd->groups);atomic_inc(&sd->groups->ref);if (cpu != cpumask_first(span))return 0;lockdep_assert_held(&sched_domains_mutex);covered = sched_domains_tmpmask;cpumask_clear(covered);/* (2.4.2) 挑选有sd链接的sg,给其中的sg->cpumask[]成员赋值 */for_each_cpu(i, span) {struct sched_group *sg;int group, j;if (cpumask_test_cpu(i, covered))continue;group = get_group(i, sdd, &sg);cpumask_setall(sched_group_mask(sg));for_each_cpu(j, span) {if (get_group(j, sdd, NULL) != group)continue;cpumask_set_cpu(j, covered);cpumask_set_cpu(j, sched_group_cpus(sg));}/* (2.4.3) 挑选有sd链接的sg,将同一层级sg链接成链表, */if (!first)first = sg;if (last)last->next = sg;last = sg;}last->next = first;return 0;
}||→static void init_sched_energy(int cpu, struct sched_domain *sd,sched_domain_energy_f fn)
{if (!(fn && fn(cpu)))return;if (cpu != group_balance_cpu(sd->groups))return;if (sd->child && !sd->child->groups->sge) {pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
#ifdef CONFIG_SCHED_DEBUGpr_err("     energy data on %s but not on %s domain\n",sd->name, sd->child->name);
#endifreturn;}check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));/* (2.5.1) 不同层级tl,按照tl->energy()给sg->sge赋值 */sd->groups->sge = fn(cpu);
}||→static void claim_allocations(int cpu, struct sched_domain *sd)
{struct sd_data *sdd = sd->private;/* (2.6.1) 对有人使用的tl->data->sd、tl->data->sg、tl->data->sgc置空,无人使用的空间,将会在__free_domain_allocs()中被释放*/WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);*per_cpu_ptr(sdd->sd, cpu) = NULL;if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))*per_cpu_ptr(sdd->sg, cpu) = NULL;if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}||→static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{struct sched_group *sg = sd->groups;WARN_ON(!sg);do {/* (2.7.1) 更新sg->group_weight的值 */sg->group_weight = cpumask_weight(sched_group_cpus(sg));sg = sg->next;} while (sg != sd->groups);if (cpu != group_balance_cpu(sg))return;/* (2.7.2) 更新sgc->capacity的值 */update_group_capacity(sd, cpu);/* (2.7.3) 更新sgc->nr_busy_cpus的值 */atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}|||→void update_group_capacity(struct sched_domain *sd, int cpu)
{struct sched_domain *child = sd->child;struct sched_group *group, *sdg = sd->groups;unsigned long capacity;unsigned long interval;interval = msecs_to_jiffies(sd->balance_interval);interval = clamp(interval, 1UL, max_load_balance_interval);sdg->sgc->next_update = jiffies + interval;if (!child) {/* (2.7.2.1) 如果sd没有child是最底层tl,则调用arch_scale_cpu_capacity()获取最大运算能力,并减去rt进程的消耗rq->rt_avg,得到本sd的sg->sgc->capacity*/update_cpu_capacity(sd, cpu);return;}capacity = 0;if (child->flags & SD_OVERLAP) {/** SD_OVERLAP domains cannot assume that child groups* span the current group.*/for_each_cpu(cpu, sched_group_cpus(sdg)) {struct sched_group_capacity *sgc;struct rq *rq = cpu_rq(cpu);/** build_sched_domains() -> init_sched_groups_capacity()* gets here before we've attached the domains to the* runqueues.** Use capacity_of(), which is set irrespective of domains* in update_cpu_capacity().** This avoids capacity from being 0 and* causing divide-by-zero issues on boot.*/if (unlikely(!rq->sd)) {capacity += capacity_of(cpu);continue;}sgc = rq->sd->groups->sgc;capacity += sgc->capacity;}} else  {/** !SD_OVERLAP domains can assume that child groups* span the current group.*/ /*  (2.7.2.2) 如果sd有child不是最底层tl,则sgc->capacity等于所有child sg的group->sgc->capacity的和*/group = child->groups;do {capacity += group->sgc->capacity;group = group->next;} while (group != child->groups);}sdg->sgc->capacity = capacity;
}||||→static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);struct sched_group *sdg = sd->groups;struct max_cpu_capacity *mcc;unsigned long max_capacity;int max_cap_cpu;unsigned long flags;/* (2.7.2.1.1) 根据arch_scale_cpu_capacity获取到本cpu最大/orig capacity*/cpu_rq(cpu)->cpu_capacity_orig = capacity;mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;raw_spin_lock_irqsave(&mcc->lock, flags);max_capacity = mcc->val;max_cap_cpu = mcc->cpu;if ((max_capacity > capacity && max_cap_cpu == cpu) ||(max_capacity < capacity)) {mcc->val = capacity;mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUGraw_spin_unlock_irqrestore(&mcc->lock, flags);/* pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); */goto skip_unlock;
#endif}raw_spin_unlock_irqrestore(&mcc->lock, flags);skip_unlock: __attribute__ ((unused));/* (2.7.2.1.2) 减去rt消耗的capacity,rq->rt_avg/(sched_avg_period() + delta)是rt进程占用cpu的比例,剩下就为cfs可用的capacity*/capacity *= scale_rt_capacity(cpu);capacity >>= SCHED_CAPACITY_SHIFT;if (!capacity)capacity = 1;cpu_rq(cpu)->cpu_capacity = capacity;sdg->sgc->capacity = capacity;
}

init_sched_domains() builds the sched_domains at boot time. When the set of online CPUs changes because of CPU hotplug, partition_sched_domains() is called to rebuild the system's sched_domains.

cpu_up() -> _cpu_up() -> __raw_notifier_call_chain() -> cpuset_cpu_active() -> cpuset_update_active_cpus() -> partition_sched_domains() -> build_sched_domains();void __init sched_init_smp(void)
{hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);}static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,void *hcpu)
{switch (action) {case CPU_ONLINE_FROZEN:case CPU_DOWN_FAILED_FROZEN:/** num_cpus_frozen tracks how many CPUs are involved in suspend* resume sequence. As long as this is not the last online* operation in the resume sequence, just build a single sched* domain, ignoring cpusets.*/num_cpus_frozen--;if (likely(num_cpus_frozen)) {partition_sched_domains(1, NULL, NULL);break;}/** This is the last CPU online operation. So fall through and* restore the original sched domains by considering the* cpuset configurations.*/case CPU_ONLINE:cpuset_update_active_cpus(true);break;default:return NOTIFY_DONE;}return NOTIFY_OK;
}static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,void *hcpu)
{unsigned long flags;long cpu = (long)hcpu;struct dl_bw *dl_b;bool overflow;int cpus;switch (action) {case CPU_DOWN_PREPARE:rcu_read_lock_sched();dl_b = dl_bw_of(cpu);raw_spin_lock_irqsave(&dl_b->lock, flags);cpus = dl_bw_cpus(cpu);overflow = __dl_overflow(dl_b, cpus, 0, 0);raw_spin_unlock_irqrestore(&dl_b->lock, flags);rcu_read_unlock_sched();if (overflow)return notifier_from_errno(-EBUSY);cpuset_update_active_cpus(false);break;case CPU_DOWN_PREPARE_FROZEN:num_cpus_frozen++;partition_sched_domains(1, NULL, NULL);break;default:return NOTIFY_DONE;}return NOTIFY_OK;
}

4.1.1.4. Scheduling Domains on mt6799

At system initialization, "maxcpus=8" is passed on the cmdline, so setup_max_cpus = 8 and SMP only brings up 8 cores; mt6799's other two big cores are started later. Let's look at what the scheduling domains look like when the system boots with 8 cores.

During boot, each topology level (tl) allocates sd, sg and sgc memory for every cpu, but once the effective links are established some of the sg/sgc allocations are never used. The unused memory is marked in claim_allocations() and released by __free_domain_allocs() before build_sched_domains() returns.

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() ->  __visit_domain_allocation_hell() -> __sdt_alloc():
[__sdt_alloc][tl MC] cpu0, &sd = 0xffffffc15663c600, &sg = 0xffffffc156062600, &sgc = 0xffffffc156062780
[__sdt_alloc][tl MC] cpu1, &sd = 0xffffffc15608f000, &sg = 0xffffffc156056780, &sgc = 0xffffffc156090000
[__sdt_alloc][tl MC] cpu2, &sd = 0xffffffc15608fc00, &sg = 0xffffffc156090d80, &sgc = 0xffffffc156090180
[__sdt_alloc][tl MC] cpu3, &sd = 0xffffffc15608f300, &sg = 0xffffffc156090c00, &sgc = 0xffffffc156090300
[__sdt_alloc][tl MC] cpu4, &sd = 0xffffffc15608f900, &sg = 0xffffffc156090a80, &sgc = 0xffffffc156090480
[__sdt_alloc][tl MC] cpu5, &sd = 0xffffffc15608f600, &sg = 0xffffffc156090900, &sgc = 0xffffffc156090600
[__sdt_alloc][tl MC] cpu6, &sd = 0xffffffc156091000, &sg = 0xffffffc156090780, &sgc = 0xffffffc156092000
[__sdt_alloc][tl MC] cpu7, &sd = 0xffffffc156091c00, &sg = 0xffffffc156092d80, &sgc = 0xffffffc156092180
[__sdt_alloc][tl DIE] cpu0, &sd = 0xffffffc156091300, &sg = 0xffffffc156092c00, &sgc = 0xffffffc156092300
[__sdt_alloc][tl DIE] cpu1, &sd = 0xffffffc156091900, &sg = 0xffffffc156092a80, &sgc = 0xffffffc156092480
[__sdt_alloc][tl DIE] cpu2, &sd = 0xffffffc156091600, &sg = 0xffffffc156092900, &sgc = 0xffffffc156092600
[__sdt_alloc][tl DIE] cpu3, &sd = 0xffffffc156093000, &sg = 0xffffffc156092780, &sgc = 0xffffffc156094000
[__sdt_alloc][tl DIE] cpu4, &sd = 0xffffffc156093c00, &sg = 0xffffffc156094d80, &sgc = 0xffffffc156094180
[__sdt_alloc][tl DIE] cpu5, &sd = 0xffffffc156093300, &sg = 0xffffffc156094c00, &sgc = 0xffffffc156094300
[__sdt_alloc][tl DIE] cpu6, &sd = 0xffffffc156093900, &sg = 0xffffffc156094a80, &sgc = 0xffffffc156094480
[__sdt_alloc][tl DIE] cpu7, &sd = 0xffffffc156093600, &sg = 0xffffffc156094900, &sgc = 0xffffffc156094600 

After the links are established, the sd/sg relationships at each topology level are:

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> build_sched_groups():
[build_sched_domains][tl MC] cpu0, sd->groups=0xffffffc156062600, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu0, sg->sgc=0xffffffc156062780, sg->next=0xffffffc156056780, sg->group_weight=0, sg->cpumask[]=0x1
[build_sched_domains][tl MC] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu0, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu0, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu1, sd->groups=0xffffffc156056780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu1, sg->sgc=0xffffffc156090000, sg->next=0xffffffc156090d80, sg->group_weight=0, sg->cpumask[]=0x2
[build_sched_domains][tl MC] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu1, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu1, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu2, sd->groups=0xffffffc156090d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu2, sg->sgc=0xffffffc156090180, sg->next=0xffffffc156090c00, sg->group_weight=0, sg->cpumask[]=0x4
[build_sched_domains][tl MC] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu2, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu2, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu3, sd->groups=0xffffffc156090c00, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu3, sg->sgc=0xffffffc156090300, sg->next=0xffffffc156062600, sg->group_weight=0, sg->cpumask[]=0x8
[build_sched_domains][tl MC] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu3, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu3, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu4, sd->groups=0xffffffc156090a80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu4, sg->sgc=0xffffffc156090480, sg->next=0xffffffc156090900, sg->group_weight=0, sg->cpumask[]=0x10
[build_sched_domains][tl MC] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu4, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu4, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu5, sd->groups=0xffffffc156090900, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu5, sg->sgc=0xffffffc156090600, sg->next=0xffffffc156090780, sg->group_weight=0, sg->cpumask[]=0x20
[build_sched_domains][tl MC] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu5, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu5, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu6, sd->groups=0xffffffc156090780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu6, sg->sgc=0xffffffc156092000, sg->next=0xffffffc156092d80, sg->group_weight=0, sg->cpumask[]=0x40
[build_sched_domains][tl MC] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu6, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu6, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu7, sd->groups=0xffffffc156092d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu7, sg->sgc=0xffffffc156092180, sg->next=0xffffffc156090a80, sg->group_weight=0, sg->cpumask[]=0x80
[build_sched_domains][tl MC] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu7, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu7, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl DIE] cpu0, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu0, sg->sgc=0xffffffc156092300, sg->next=0xffffffc156094d80, sg->group_weight=0, sg->cpumask[]=0xf
[build_sched_domains][tl DIE] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl DIE] cpu0, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu0, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu1, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu1, sg->sgc=0x0, sg->next=0xffffffc156092a80, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu1, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu1, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu2, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu2, sg->sgc=0x0, sg->next=0xffffffc156092900, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu2, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu2, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu3, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu3, sg->sgc=0x0, sg->next=0xffffffc156092780, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu3, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu3, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu4, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu4, sg->sgc=0xffffffc156094180, sg->next=0xffffffc156092c00, sg->group_weight=0, sg->cpumask[]=0xf0
[build_sched_domains][tl DIE] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl DIE] cpu4, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu4, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu5, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu5, sg->sgc=0x0, sg->next=0xffffffc156094c00, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu5, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu5, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu6, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu6, sg->sgc=0x0, sg->next=0xffffffc156094a80, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu6, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu6, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu7, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu7, sg->sgc=0x0, sg->next=0xffffffc156094900, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu7, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu7, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|

Expressed graphically, the relationships are as follows:
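A rough text sketch of what the logs above describe (cpu8/cpu9 are not yet online):

DIE level (sd->span = 0xff):  sg{cpu0-3} -> sg{cpu4-7} -> sg{cpu0-3} ...    (two groups in a ring)
MC  level (sd->span = 0xf):   sg{0} -> sg{1} -> sg{2} -> sg{3} -> sg{0} ... (one group per cpu of cluster0)
MC  level (sd->span = 0xf0):  sg{4} -> sg{5} -> sg{6} -> sg{7} -> sg{4} ... (one group per cpu of cluster1)
Each cpu's MC-level sd is the child of its DIE-level sd (sd->child / sd->parent).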

The parameters in each sched_domain are also very important. They are initialized in sd_init(), and SMP load balancing uses these parameters and flags constantly:

| sd field | tl MC level | tl DIE level |
|---|---|---|
| sd->min_interval | 4 | 8 |
| sd->max_interval | 8 | 16 |
| sd->busy_factor | 32 | 32 |
| sd->imbalance_pct | 117 | 125 |
| sd->cache_nice_tries | 1 | 1 |
| sd->busy_idx | 2 | 2 |
| sd->idle_idx | 0 | 1 |
| sd->newidle_idx | 0 | 0 |
| sd->wake_idx | 0 | 0 |
| sd->forkexec_idx | 0 | 0 |
| sd->span_weight | 4 | 8 |
| sd->balance_interval | 4 | 8 |
| sd->level | 0 | 1 |
| sd->flags | 0x832f: SD_LOAD_BALANCE\|SD_BALANCE_NEWIDLE\|SD_BALANCE_EXEC\|SD_BALANCE_FORK\|SD_WAKE_AFFINE\|SD_SHARE_POWERDOMAIN\|SD_SHARE_PKG_RESOURCES\|SD_SHARE_CAP_STATES | 0x102f: SD_LOAD_BALANCE\|SD_BALANCE_NEWIDLE\|SD_BALANCE_EXEC\|SD_BALANCE_FORK\|SD_WAKE_AFFINE\|SD_PREFER_SIBLING |
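As one example of how these fields are consumed, find_busiest_group() uses imbalance_pct as a percentage threshold before declaring the domain imbalanced. Below is a minimal sketch of the shape of that comparison only, not the full set of checks:

static int group_clearly_busier(unsigned long local_avg_load,
                                unsigned long busiest_avg_load,
                                unsigned int imbalance_pct)
{
    /* MC: 117 -> busiest must be ~17% above local; DIE: 125 -> 25% */
    return 100 * busiest_avg_load > imbalance_pct * local_avg_load;
}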

update_top_cache_domain() also caches some commonly used sd pointers; from the prints we can see which level each cached sd actually corresponds to:

| cached sd | accessor | assigned value |
|---|---|---|
| sd_busy | per_cpu(sd_busy, cpu) | this cpu's tl DIE level sd |
| sd_llc | per_cpu(sd_llc, cpu) | this cpu's tl MC level sd |
| sd_llc_size | per_cpu(sd_llc_size, cpu) | 4 |
| sd_llc_id | per_cpu(sd_llc_id, cpu) | 0/4 |
| sd_numa | per_cpu(sd_numa, cpu) | 0 |
| sd_asym | per_cpu(sd_asym, cpu) | 0 |
| sd_ea | per_cpu(sd_ea, cpu) | this cpu's tl DIE level sd |
| sd_scs | per_cpu(sd_scs, cpu) | this cpu's tl MC level sd |
static void update_top_cache_domain(int cpu)
{
    struct sched_domain *sd;
    struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
    int id = cpu;
    int size = 1;

    sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
    if (sd) {
        id = cpumask_first(sched_domain_span(sd));
        size = cpumask_weight(sched_domain_span(sd));
        busy_sd = sd->parent; /* sd_busy */
    }
    rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);

    rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
    per_cpu(sd_llc_size, cpu) = size;
    per_cpu(sd_llc_id, cpu) = id;

    sd = lowest_flag_domain(cpu, SD_NUMA);
    rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

    sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
    rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);

    for_each_domain(cpu, sd) {
        if (sd->groups->sge)
            ea_sd = sd;
        else
            break;
    }
    rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);

    sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
    rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
[update_top_cache_domain] cpu0, sd_busy=0xffffffc156091300, sd_llc=0xffffffc15663c600, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091300, sd_scs=0xffffffc15663c600
[update_top_cache_domain] cpu1, sd_busy=0xffffffc156091900, sd_llc=0xffffffc15608f000, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091900, sd_scs=0xffffffc15608f000
[update_top_cache_domain] cpu2, sd_busy=0xffffffc156091600, sd_llc=0xffffffc15608fc00, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091600, sd_scs=0xffffffc15608fc00
[update_top_cache_domain] cpu3, sd_busy=0xffffffc156093000, sd_llc=0xffffffc15608f300, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093000, sd_scs=0xffffffc15608f300
[update_top_cache_domain] cpu4, sd_busy=0xffffffc156093c00, sd_llc=0xffffffc15608f900, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093c00, sd_scs=0xffffffc15608f900
[update_top_cache_domain] cpu5, sd_busy=0xffffffc156093300, sd_llc=0xffffffc15608f600, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093300, sd_scs=0xffffffc15608f600
[update_top_cache_domain] cpu6, sd_busy=0xffffffc156093900, sd_llc=0xffffffc156091000, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093900, sd_scs=0xffffffc156091000
[update_top_cache_domain] cpu7, sd_busy=0xffffffc156093600, sd_llc=0xffffffc156091c00, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093600, sd_scs=0xffffffc156091c00

The tables mt6799 uses when computing energy and compute capacity are as follows:

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> init_sched_energy()/init_sched_groups_capacity();

/* v1 FY */
struct upower_tbl_info upower_tbl_infos_FY[NR_UPOWER_BANK] = {INIT_UPOWER_TBL_INFOS(UPOWER_BANK_LL, upower_tbl_ll_1_FY),INIT_UPOWER_TBL_INFOS(UPOWER_BANK_L, upower_tbl_l_1_FY),INIT_UPOWER_TBL_INFOS(UPOWER_BANK_B, upower_tbl_b_1_FY),INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_LL, upower_tbl_cluster_ll_1_FY),INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_L, upower_tbl_cluster_l_1_FY),INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_B, upower_tbl_cluster_b_1_FY),INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CCI, upower_tbl_cci_1_FY),
};/* ver1 */
/* FY table */
struct upower_tbl upower_tbl_ll_1_FY = {.row = {{.cap = 100, .volt = 75000, .dyn_pwr = 9994, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },{.cap = 126, .volt = 75000, .dyn_pwr = 12585, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },{.cap = 148, .volt = 75000, .dyn_pwr = 14806, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },{.cap = 167, .volt = 75000, .dyn_pwr = 16656, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },{.cap = 189, .volt = 75000, .dyn_pwr = 18877, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },{.cap = 212, .volt = 75000, .dyn_pwr = 21098, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },{.cap = 230, .volt = 75700, .dyn_pwr = 23379, .lkg_pwr = {13936, 13936, 13936, 13936, 13936, 13936} },{.cap = 245, .volt = 78100, .dyn_pwr = 26490, .lkg_pwr = {14811, 14811, 14811, 14811, 14811, 14811} },{.cap = 263, .volt = 81100, .dyn_pwr = 30729, .lkg_pwr = {15958, 15958, 15958, 15958, 15958, 15958} },{.cap = 278, .volt = 83500, .dyn_pwr = 34409, .lkg_pwr = {16949, 16949, 16949, 16949, 16949, 16949} },{.cap = 293, .volt = 86000, .dyn_pwr = 38447, .lkg_pwr = {18036, 18036, 18036, 18036, 18036, 18036} },{.cap = 304, .volt = 88400, .dyn_pwr = 42166, .lkg_pwr = {19159, 19159, 19159, 19159, 19159, 19159} },{.cap = 319, .volt = 90800, .dyn_pwr = 46657, .lkg_pwr = {20333, 20333, 20333, 20333, 20333, 20333} },{.cap = 334, .volt = 93200, .dyn_pwr = 51442, .lkg_pwr = {21605, 21605, 21605, 21605, 21605, 21605} },{.cap = 345, .volt = 95000, .dyn_pwr = 55230, .lkg_pwr = {22560, 22560, 22560, 22560, 22560, 22560} },{.cap = 356, .volt = 97400, .dyn_pwr = 59928, .lkg_pwr = {24002, 24002, 24002, 24002, 24002, 24002} },},.lkg_idx = DEFAULT_LKG_IDX,.row_num = UPOWER_OPP_NUM,.nr_idle_states = NR_UPOWER_CSTATES,.idle_states = {{{0}, {7321} },{{0}, {7321} },{{0}, {7321} },{{0}, {7321} },{{0}, {7321} },{{0}, {7321} },},
};struct upower_tbl upower_tbl_cluster_ll_1_FY = {.row = {{.cap = 100, .volt = 75000, .dyn_pwr = 3656, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },{.cap = 126, .volt = 75000, .dyn_pwr = 4604, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },{.cap = 148, .volt = 75000, .dyn_pwr = 5417, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },{.cap = 167, .volt = 75000, .dyn_pwr = 6094, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },{.cap = 189, .volt = 75000, .dyn_pwr = 6906, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },{.cap = 212, .volt = 75000, .dyn_pwr = 7719, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },{.cap = 230, .volt = 75700, .dyn_pwr = 8553, .lkg_pwr = {22134, 22134, 22134, 22134, 22134, 22134} },{.cap = 245, .volt = 78100, .dyn_pwr = 9692, .lkg_pwr = {23523, 23523, 23523, 23523, 23523, 23523} },{.cap = 263, .volt = 81100, .dyn_pwr = 11242, .lkg_pwr = {25344, 25344, 25344, 25344, 25344, 25344} },{.cap = 278, .volt = 83500, .dyn_pwr = 12589, .lkg_pwr = {26919, 26919, 26919, 26919, 26919, 26919} },{.cap = 293, .volt = 86000, .dyn_pwr = 14066, .lkg_pwr = {28646, 28646, 28646, 28646, 28646, 28646} },{.cap = 304, .volt = 88400, .dyn_pwr = 15427, .lkg_pwr = {30430, 30430, 30430, 30430, 30430, 30430} },{.cap = 319, .volt = 90800, .dyn_pwr = 17069, .lkg_pwr = {32293, 32293, 32293, 32293, 32293, 32293} },{.cap = 334, .volt = 93200, .dyn_pwr = 18820, .lkg_pwr = {34314, 34314, 34314, 34314, 34314, 34314} },{.cap = 345, .volt = 95000, .dyn_pwr = 20206, .lkg_pwr = {35830, 35830, 35830, 35830, 35830, 35830} },{.cap = 356, .volt = 97400, .dyn_pwr = 21925, .lkg_pwr = {38121, 38121, 38121, 38121, 38121, 38121} },},.lkg_idx = DEFAULT_LKG_IDX,.row_num = UPOWER_OPP_NUM,.nr_idle_states = NR_UPOWER_CSTATES,.idle_states = {{{0}, {11628} },{{0}, {11628} },{{0}, {11628} },{{0}, {11628} },{{0}, {11628} },{{0}, {11628} },},
};struct upower_tbl upower_tbl_l_1_FY = {.row = {{.cap = 116, .volt = 75000, .dyn_pwr = 16431, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },{.cap = 152, .volt = 75000, .dyn_pwr = 21486, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },{.cap = 179, .volt = 75000, .dyn_pwr = 25278, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },{.cap = 201, .volt = 75000, .dyn_pwr = 28437, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },{.cap = 228, .volt = 75000, .dyn_pwr = 32229, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },{.cap = 255, .volt = 75000, .dyn_pwr = 36021, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },{.cap = 282, .volt = 75700, .dyn_pwr = 40559, .lkg_pwr = {23423, 23423, 23423, 23423, 23423, 23423} },{.cap = 304, .volt = 78100, .dyn_pwr = 46598, .lkg_pwr = {24968, 24968, 24968, 24968, 24968, 24968} },{.cap = 331, .volt = 81100, .dyn_pwr = 54680, .lkg_pwr = {26999, 26999, 26999, 26999, 26999, 26999} },{.cap = 349, .volt = 83500, .dyn_pwr = 61098, .lkg_pwr = {28760, 28760, 28760, 28760, 28760, 28760} },{.cap = 371, .volt = 86000, .dyn_pwr = 68965, .lkg_pwr = {30698, 30698, 30698, 30698, 30698, 30698} },{.cap = 393, .volt = 88400, .dyn_pwr = 77258, .lkg_pwr = {32706, 32706, 32706, 32706, 32706, 32706} },{.cap = 416, .volt = 90800, .dyn_pwr = 86141, .lkg_pwr = {34808, 34808, 34808, 34808, 34808, 34808} },{.cap = 438, .volt = 93200, .dyn_pwr = 95634, .lkg_pwr = {37097, 37097, 37097, 37097, 37097, 37097} },{.cap = 452, .volt = 95000, .dyn_pwr = 102406, .lkg_pwr = {38814, 38814, 38814, 38814, 38814, 38814} },{.cap = 474, .volt = 97400, .dyn_pwr = 112974, .lkg_pwr = {41424, 41424, 41424, 41424, 41424, 41424} },},.lkg_idx = DEFAULT_LKG_IDX,.row_num = UPOWER_OPP_NUM,.nr_idle_states = NR_UPOWER_CSTATES,.idle_states = {{{0}, {11926} },{{0}, {11926} },{{0}, {11926} },{{0}, {11926} },{{0}, {11926} },{{0}, {11926} },},
};struct upower_tbl upower_tbl_cluster_l_1_FY = {.row = {{.cap = 116, .volt = 75000, .dyn_pwr = 2778, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },{.cap = 152, .volt = 75000, .dyn_pwr = 3633, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },{.cap = 179, .volt = 75000, .dyn_pwr = 4274, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },{.cap = 201, .volt = 75000, .dyn_pwr = 4808, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },{.cap = 228, .volt = 75000, .dyn_pwr = 5449, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },{.cap = 255, .volt = 75000, .dyn_pwr = 6090, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },{.cap = 282, .volt = 75700, .dyn_pwr = 6857, .lkg_pwr = {27058, 27058, 27058, 27058, 27058, 27058} },{.cap = 304, .volt = 78100, .dyn_pwr = 7878, .lkg_pwr = {28843, 28843, 28843, 28843, 28843, 28843} },{.cap = 331, .volt = 81100, .dyn_pwr = 9245, .lkg_pwr = {31188, 31188, 31188, 31188, 31188, 31188} },{.cap = 349, .volt = 83500, .dyn_pwr = 10330, .lkg_pwr = {33223, 33223, 33223, 33223, 33223, 33223} },{.cap = 371, .volt = 86000, .dyn_pwr = 11660, .lkg_pwr = {35461, 35461, 35461, 35461, 35461, 35461} },{.cap = 393, .volt = 88400, .dyn_pwr = 13062, .lkg_pwr = {37781, 37781, 37781, 37781, 37781, 37781} },{.cap = 416, .volt = 90800, .dyn_pwr = 14564, .lkg_pwr = {40209, 40209, 40209, 40209, 40209, 40209} },{.cap = 438, .volt = 93200, .dyn_pwr = 16169, .lkg_pwr = {42854, 42854, 42854, 42854, 42854, 42854} },{.cap = 452, .volt = 95000, .dyn_pwr = 17314, .lkg_pwr = {44837, 44837, 44837, 44837, 44837, 44837} },{.cap = 474, .volt = 97400, .dyn_pwr = 19101, .lkg_pwr = {47852, 47852, 47852, 47852, 47852, 47852} },},.lkg_idx = DEFAULT_LKG_IDX,.row_num = UPOWER_OPP_NUM,.nr_idle_states = NR_UPOWER_CSTATES,.idle_states = {{{0}, {13776} },{{0}, {13776} },{{0}, {13776} },{{0}, {13776} },{{0}, {13776} },{{0}, {13776} },},
};struct upower_tbl upower_tbl_b_1_FY = {.row = {{.cap = 211, .volt = 75000, .dyn_pwr = 61732, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },{.cap = 268, .volt = 75000, .dyn_pwr = 78352, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },{.cap = 317, .volt = 75000, .dyn_pwr = 92598, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },{.cap = 358, .volt = 75000, .dyn_pwr = 104469, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },{.cap = 406, .volt = 75000, .dyn_pwr = 118715, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },{.cap = 447, .volt = 75000, .dyn_pwr = 130587, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },{.cap = 504, .volt = 75700, .dyn_pwr = 149968, .lkg_pwr = {72438, 72438, 72438, 72438, 72438, 72438} },{.cap = 561, .volt = 78100, .dyn_pwr = 177650, .lkg_pwr = {76806, 76806, 76806, 76806, 76806, 76806} },{.cap = 634, .volt = 81100, .dyn_pwr = 216546, .lkg_pwr = {82521, 82521, 82521, 82521, 82521, 82521} },{.cap = 691, .volt = 83500, .dyn_pwr = 250153, .lkg_pwr = {87447, 87447, 87447, 87447, 87447, 87447} },{.cap = 748, .volt = 86000, .dyn_pwr = 287210, .lkg_pwr = {92841, 92841, 92841, 92841, 92841, 92841} },{.cap = 805, .volt = 88400, .dyn_pwr = 326553, .lkg_pwr = {98397, 98397, 98397, 98397, 98397, 98397} },{.cap = 861, .volt = 90800, .dyn_pwr = 368886, .lkg_pwr = {104190, 104190, 104190, 104190, 104190, 104190} },{.cap = 918, .volt = 93200, .dyn_pwr = 414309, .lkg_pwr = {110456, 110456, 110456, 110456, 110456, 110456} },{.cap = 959, .volt = 95000, .dyn_pwr = 449514, .lkg_pwr = {115156, 115156, 115156, 115156, 115156, 115156} },{.cap = 1024, .volt = 97400, .dyn_pwr = 504548, .lkg_pwr = {122224, 122224, 122224, 122224, 122224, 122224} },},.lkg_idx = DEFAULT_LKG_IDX,.row_num = UPOWER_OPP_NUM,.nr_idle_states = NR_UPOWER_CSTATES,.idle_states = {{{0}, {38992} },{{0}, {38992} },{{0}, {38992} },{{0}, {38992} },{{0}, {38992} },{{0}, {38992} },},
};struct upower_tbl upower_tbl_cluster_b_1_FY = {.row = {{.cap = 211, .volt = 75000, .dyn_pwr = 6408, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },{.cap = 268, .volt = 75000, .dyn_pwr = 8133, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },{.cap = 317, .volt = 75000, .dyn_pwr = 9612, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },{.cap = 358, .volt = 75000, .dyn_pwr = 10844, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },{.cap = 406, .volt = 75000, .dyn_pwr = 12323, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },{.cap = 447, .volt = 75000, .dyn_pwr = 13555, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },{.cap = 504, .volt = 75700, .dyn_pwr = 15567, .lkg_pwr = {28054, 28054, 28054, 28054, 28054, 28054} },{.cap = 561, .volt = 78100, .dyn_pwr = 18440, .lkg_pwr = {29746, 29746, 29746, 29746, 29746, 29746} },{.cap = 634, .volt = 81100, .dyn_pwr = 22478, .lkg_pwr = {31959, 31959, 31959, 31959, 31959, 31959} },{.cap = 691, .volt = 83500, .dyn_pwr = 25966, .lkg_pwr = {33867, 33867, 33867, 33867, 33867, 33867} },{.cap = 748, .volt = 86000, .dyn_pwr = 29813, .lkg_pwr = {35956, 35956, 35956, 35956, 35956, 35956} },{.cap = 805, .volt = 88400, .dyn_pwr = 33897, .lkg_pwr = {38108, 38108, 38108, 38108, 38108, 38108} },{.cap = 861, .volt = 90800, .dyn_pwr = 38291, .lkg_pwr = {40351, 40351, 40351, 40351, 40351, 40351} },{.cap = 918, .volt = 93200, .dyn_pwr = 43006, .lkg_pwr = {42778, 42778, 42778, 42778, 42778, 42778} },{.cap = 959, .volt = 95000, .dyn_pwr = 46661, .lkg_pwr = {44598, 44598, 44598, 44598, 44598, 44598} },{.cap = 1024, .volt = 97400, .dyn_pwr = 52373, .lkg_pwr = {47335, 47335, 47335, 47335, 47335, 47335} },},.lkg_idx = DEFAULT_LKG_IDX,.row_num = UPOWER_OPP_NUM,.nr_idle_states = NR_UPOWER_CSTATES,.idle_states = {{{0}, {15101} },{{0}, {15101} },{{0}, {15101} },{{0}, {15101} },{{0}, {15101} },{{0}, {15101} },},
};struct upower_tbl upower_tbl_cci_1_FY = {.row = {{.cap = 0, .volt = 75000, .dyn_pwr = 2708, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },{.cap = 0, .volt = 75000, .dyn_pwr = 3611, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },{.cap = 0, .volt = 75000, .dyn_pwr = 4288, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },{.cap = 0, .volt = 75000, .dyn_pwr = 5191, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },{.cap = 0, .volt = 75000, .dyn_pwr = 5868, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },{.cap = 0, .volt = 75000, .dyn_pwr = 6771, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },{.cap = 0, .volt = 75700, .dyn_pwr = 7588, .lkg_pwr = {16537, 16537, 16537, 16537, 16537, 16537} },{.cap = 0, .volt = 78100, .dyn_pwr = 8811, .lkg_pwr = {17527, 17527, 17527, 17527, 17527, 17527} },{.cap = 0, .volt = 81100, .dyn_pwr = 10292, .lkg_pwr = {18822, 18822, 18822, 18822, 18822, 18822} },{.cap = 0, .volt = 83500, .dyn_pwr = 11750, .lkg_pwr = {19938, 19938, 19938, 19938, 19938, 19938} },{.cap = 0, .volt = 86000, .dyn_pwr = 13354, .lkg_pwr = {21159, 21159, 21159, 21159, 21159, 21159} },{.cap = 0, .volt = 88400, .dyn_pwr = 14737, .lkg_pwr = {22417, 22417, 22417, 22417, 22417, 22417} },{.cap = 0, .volt = 90800, .dyn_pwr = 16540, .lkg_pwr = {23728, 23728, 23728, 23728, 23728, 23728} },{.cap = 0, .volt = 93200, .dyn_pwr = 18472, .lkg_pwr = {25145, 25145, 25145, 25145, 25145, 25145} },{.cap = 0, .volt = 95000, .dyn_pwr = 19916, .lkg_pwr = {26208, 26208, 26208, 26208, 26208, 26208} },{.cap = 0, .volt = 97400, .dyn_pwr = 22077, .lkg_pwr = {27805, 27805, 27805, 27805, 27805, 27805} },},.lkg_idx = DEFAULT_LKG_IDX,.row_num = UPOWER_OPP_NUM,.nr_idle_states = NR_UPOWER_CSTATES,.idle_states = {{{0}, {8938} },{{0}, {8938} },{{0}, {8938} },{{0}, {8938} },{{0}, {8938} },{{0}, {8938} },},
};

4.1.2. SMP Load Balancing Implementation

Load balancing depends on many parameters; the most important ones are listed below:

| 成员 | 所属结构 | 含义 | 更新/获取函数 | 计算方法 |
|---|---|---|---|---|
| rq->cpu_capacity_orig | rq | 本cpu总的计算能力 | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity = arch_scale_cpu_capacity(sd, cpu) |
| rq->cpu_capacity | rq | 本cpu cfs的计算能力 = 总capacity - rt占用的capacity | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity *= scale_rt_capacity(cpu) |
| rq->rd->max_cpu_capacity | rq->rd | root_domain中最大的cpu计算能力 | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | |
| rq->rd->overutilized | rq->rd | root_domain中是否有cpu算力不足(util超出预留比例) | update_sd_lb_stats() | |
| rq->rd->overload | rq->rd | root_domain中是否有cpu的runnable进程数大于1 | update_sd_lb_stats() | |
| rq->rt_avg | rq | 本cpu的rt平均负载 | weighted_cpuload() -> cfs_rq_runnable_load_avg() | |
| rq->cfs.runnable_load_avg | rq->cfs(cfs_rq) | 本cpu cfs_rq的runnable平均负载 | __update_load_avg()、cfs_rq_load_avg() | (runnable时间*freq*weight)/LOAD_AVG_MAX |
| rq->cfs.avg.load_avg | rq->cfs.avg | 本cpu cfs_rq的runnable平均负载 | __update_load_avg() | (runnable时间*freq*weight)/LOAD_AVG_MAX |
| rq->cfs.avg.loadwop_avg | rq->cfs.avg | 本cpu cfs_rq的runnable平均负载,不含weight | __update_load_avg() | (runnable时间*freq)/LOAD_AVG_MAX |
| rq->cfs.avg.util_avg | rq->cfs.avg | 本cpu cfs_rq的running负载 | __update_load_avg()、cpu_util() -> __cpu_util() | (running时间*freq*capacity)/LOAD_AVG_MAX |
| cfs_rq->nr_running | cfs_rq | 本cfs_rq这个层次runnable的se的数量 | enqueue_entity()/dequeue_entity() -> account_entity_enqueue() | |
| cfs_rq->h_nr_running | cfs_rq | 本cfs_rq包含的所有子cfs_rq nr_running的总和 | enqueue_task_fair()/dequeue_task_fair() | |
| rq->nr_running | rq | 本cpu rq所有runnable的se的数量,包含所有子cfs_rq | enqueue_task_fair()/dequeue_task_fair() -> add_nr_running() | |
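
上表中load_avg/util_avg这一类数值都来自PELT(per-entity load tracking)的几何级数衰减累加。下面给出一个用户态的小示例(只是示意,非内核实现;y、LOAD_AVG_MAX等取内核常用值),演示"(runnable时间*freq*weight)/LOAD_AVG_MAX"这类公式的大致形态:

```c
#include <stdio.h>

/* 示意:PELT风格的负载累加(用户态演示,非内核实现)
 * 每个周期约1024us,历史贡献按 y 衰减,y^32 = 0.5;
 * LOAD_AVG_MAX(约47742)是该几何级数的收敛上限。
 */
#define LOAD_AVG_MAX	47742.0

int main(void)
{
	const double y = 0.978520621;	/* y^32 ≈ 0.5 */
	const int weight = 1024;	/* nice=0 进程的weight */
	const double freq_scale = 1.0;	/* 假设一直运行在最高频率 */
	double sum = 0.0;

	/* 假设进程连续200个周期都处于runnable状态:sum = Σ 1024*y^i */
	for (int i = 0; i < 200; i++)
		sum = sum * y + 1024.0 * freq_scale;

	/* load_avg ≈ (衰减累加的runnable时间 * weight) / LOAD_AVG_MAX,上限趋近weight */
	printf("load_avg ~= %.0f\n", sum * weight / LOAD_AVG_MAX);
	return 0;
}
```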

4.1.2.1、rebalance_domains()

mtk定义了3种power模式来兼容EAS:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid_support(EAS、HMP同时共存);

hybrid_support()模式下:一般负载均衡交给EAS;如果cpu_rq(cpu)->rd->overutilized置位,说明负载已经严重不均衡,则交给HMP;

系统在scheduler_tick()中会定期地检测smp负载均衡的时间是否已到,如果到期则触发SCHED_SOFTIRQ软中断:

void scheduler_tick(void)
{#ifdef CONFIG_SMPrq->idle_balance = idle_cpu(cpu);trigger_load_balance(rq);
#endif}|→/** Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.*/
void trigger_load_balance(struct rq *rq)
{/* Don't need to rebalance while attached to NULL domain */if (unlikely(on_null_domain(rq)))return;if (time_after_eq(jiffies, rq->next_balance))raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMONif (nohz_kick_needed(rq))nohz_balancer_kick();
#endif
}

SCHED_SOFTIRQ软中断的执行主体为run_rebalance_domains:

__init void init_sched_fair_class(void)
{open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);}/** run_rebalance_domains is triggered when needed from the scheduler tick.* Also triggered for nohz idle balancing (with nohz_balancing_kick set).*/
static void run_rebalance_domains(struct softirq_action *h)
{struct rq *this_rq = this_rq();enum cpu_idle_type idle = this_rq->idle_balance ?CPU_IDLE : CPU_NOT_IDLE;int this_cpu = smp_processor_id();/* bypass load balance of HMP if EAS consideration */if ((!energy_aware() && sched_feat(SCHED_HMP)) ||(hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))hmp_force_up_migration(this_cpu);/** If this cpu has a pending nohz_balance_kick, then do the* balancing on behalf of the other idle cpus whose ticks are* stopped. Do nohz_idle_balance *before* rebalance_domains to* give the idle cpus a chance to load balance. Else we may* load balance only within the local sched_domain hierarchy* and abort nohz_idle_balance altogether if we pull some load.*/nohz_idle_balance(this_rq, idle);rebalance_domains(this_rq, idle);
}

我们分析最核心的函数rebalance_domains():

需要重点提一下的是:负载统计中计算了3种负载(load_avg、loadwop_avg、util_avg),rebalance_domains()主要使用其中的load_avg,并乘以(SCHED_CAPACITY_SCALE/capacity)加以转换。

  • 1、逐级轮询本cpu的sd,判断本sd的时间间隔是否到期,如果到期做load_balance();

| tl层级 | cpu_busy? | sd->balance_interval | sd->busy_factor | sd balance interval |
|---|---|---|---|---|
| MC层级 | idle | 4 | 1 | 4ms |
| MC层级 | busy | 4 | 32 | 128ms |
| DIE层级 | idle | 8 | 1 | 8ms |
| DIE层级 | busy | 8 | 32 | 256ms |
| | | | | rq->next_balance = min(上述值) |
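
上表的间隔可以用下面的草图概括(逻辑对应内核的get_sd_balance_interval(),这里省略了msecs_to_jiffies换算和max_load_balance_interval的clamp):

```c
/* 草图:sd balance间隔 = balance_interval,busy时再乘以busy_factor */
static unsigned long sd_balance_interval_ms(unsigned int balance_interval,
					    unsigned int busy_factor,
					    int cpu_busy)
{
	unsigned long interval = balance_interval;	/* MC:4ms, DIE:8ms */

	if (cpu_busy)
		interval *= busy_factor;	/* busy时放大32倍:128ms/256ms */

	return interval;
}
```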

  • 2、在load_balance()中,should_we_balance()判断本cpu在本层级sd的当前情况是否适合充当dst_cpu。能做dst_cpu的条件是:要么是本sg的第一个idle cpu,要么是本sg的第一个cpu。dst_cpu作为目的cpu,让负载高的cpu把进程迁移过来;如果本cpu不符合条件则中断操作;

  • 3、继续find_busiest_group(),在sg链表中找出负载最重的sg。核心计算在update_sd_lb_stats()、update_sg_lb_stats()中。如果dst_cpu所在的local_group负载大于busiest sg,或者大于sds平均负载,则中断操作;如果成功,则计算出需要迁移的负载env->imbalance,约为min((sds->avg - local), (busiest - sds->avg));

  • 3.1、根据当前cpu的idle状态计算cpu load(rq->cpu_load[])时选用的index值:
| tl层级 | busy_idx | idle_idx | newidle_idx |
|---|---|---|---|
| MC层级 | 2 | 0 | 0 |
| DIE层级 | 2 | 1 | 0 |
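
index的选取逻辑很直接(对应内核的get_sd_load_idx(),下面是其逻辑示意):

```c
/* 根据当前cpu的idle状态,选取rq->cpu_load[]使用的index */
static inline int get_sd_load_idx_sketch(struct sched_domain *sd,
					 enum cpu_idle_type idle)
{
	switch (idle) {
	case CPU_NOT_IDLE:
		return sd->busy_idx;	/* MC/DIE均为2 */
	case CPU_NEWLY_IDLE:
		return sd->newidle_idx;	/* 均为0 */
	default:
		return sd->idle_idx;	/* MC:0, DIE:1 */
	}
}
```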

- 3.2、计算sg负载sgs,选择sgs->avg_load最大的sg作为busiest_group。其中几个关键值的计算如下:

| 负载值 | 计算方法 | 说明 |
|---|---|---|
| sgs->group_load | += cpu_rq(cpu)->cpu_load[index-1] | 累加cpu的load值,相对值(每个cpu的最大值都是1024),且带weight分量 |
| sgs->group_util | += cpu_rq(cpu)->cfs.avg.util_avg | 累加cpu cfs running值,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
| sgs->group_capacity | += (arch_scale_cpu_capacity(sd, cpu)*(1-rt_capacity)) | 累加cpu的capacity,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
| sgs->avg_load | = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity | group_load做了转换,和group_capacity成反比 |
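
举一组假设的数值说明avg_load的相对性:小核sg的group_capacity=800、group_load=600,则avg_load = 600*1024/800 = 768;大核sg的group_capacity=2048、group_load=900,则avg_load = 900*1024/2048 = 450。虽然大核sg的绝对load更高,但按capacity折算后的相对负载(avg_load)是小核sg更重,busiest sg会选中小核sg。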

- 3.3、在计算sg负载时,几个关键状态的计算如下:

| 状态值 | 计算方法 | 说明 |
|---|---|---|
| sgs->group_no_capacity | (sgs->group_capacity * 100) < (sgs->group_util * env->sd->imbalance_pct) | 预留一定空间(比例为imbalance_pct),sg运算能力已经不够了,sgs->group_type=group_overloaded |
| dst_rq->rd->overutilized | (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin) | 预留一定空间(比例为capacity_margin),cpu运算能力已经不够了 |
| dst_rq->rd->overload | rq->nr_running > 1 | sg中任何一个cpu的runnable进程数大于1 |

比例参数imbalance_pct、capacity_margin的值为:

| tl层级 | sd->imbalance_pct (/100) | capacity_margin (/1024) |
|---|---|---|
| MC层级 | 117 | 1280 |
| DIE层级 | 125 | 1280 |
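
结合上表的比例参数,两个"算力不足"的判断可以写成如下草图(判断式与上表一致,capacity_margin取1280):

```c
#include <stdbool.h>

/* sg层面:util超过capacity的100/imbalance_pct(约85%)即认为no_capacity */
static inline bool sg_no_capacity(unsigned long group_capacity,
				  unsigned long group_util,
				  unsigned int imbalance_pct)
{
	return group_capacity * 100 < group_util * imbalance_pct;
}

/* cpu层面:util超过capacity的1024/1280(80%)即认为overutilized */
static inline bool cpu_overutilized_sketch(unsigned long capacity,
					   unsigned long util)
{
	return capacity * 1024 < util * 1280;	/* capacity_margin = 1280 */
}
```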

- 3.4、计算env->imbalance,这个是rebalance需要迁移的负载量:

| 负载值 | 计算方法 | 说明 |
|---|---|---|
| sds->total_load | += sgs->group_load | |
| sds->total_capacity | += sgs->group_capacity | |
| sds.avg_load | (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity | |
| env->imbalance | min((busiest->avg_load - sds->avg_load) * busiest->group_capacity, (sds->avg_load - local->avg_load) * local->group_capacity) / SCHED_CAPACITY_SCALE | 感觉这里计算有bug,前面是1024/capacity,后面是capacity/1024,很混乱 |
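
代入一组假设的数值(忽略load_above_capacity的截断):设sds->avg_load=600、local->avg_load=400、busiest->avg_load=900,busiest->group_capacity=1600、local->group_capacity=2048,则env->imbalance = min((900-600)*1600, (600-400)*2048)/1024 = min(480000, 409600)/1024 = 400,即本次balance期望从busiest cpu迁走约400的load。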
  • 4、继续find_busiest_queue(),查找busiest sg中负载最重的cpu。

  • 4.1、找出sg中相对负载(weighted_cpuload/capacity_of)最大的cpu:

| 负载值 | 计算方法 | 说明 |
|---|---|---|
| weighted_cpuload(cpu) | cpu_rq(cpu)->cfs->runnable_load_avg | cpu的cfs runnable load值,相对值(每个cpu的最大值都是1024),且带weight分量 |
| capacity_of(cpu) | arch_scale_cpu_capacity(sd, cpu)*(1-rt_capacity) | cpu去掉rt占用后的capacity,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
| weighted_cpuload(cpu)/capacity_of(cpu) | | 该比值最大的cpu rq,即为busiest sg中的busiest cpu rq |
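
find_busiest_queue()比较的是相对负载wl/capacity,为了避免除法,内核用交叉相乘的方式比较。下面是这个选取逻辑的独立小草图(结构体和变量名为示意):

```c
/* 草图:在sg内选出 wl/capacity 最大的cpu
 * 比较 wl_i/cap_i > wl_max/cap_max 等价于 wl_i*cap_max > wl_max*cap_i,免去除法
 */
struct cpu_load_sample {
	unsigned long wl;		/* weighted_cpuload(cpu) */
	unsigned long capacity;		/* capacity_of(cpu) */
};

static int pick_busiest_cpu(const struct cpu_load_sample *c, int nr)
{
	unsigned long busiest_load = 0, busiest_capacity = 1;
	int busiest = -1;

	for (int i = 0; i < nr; i++) {
		if (c[i].wl * busiest_capacity > busiest_load * c[i].capacity) {
			busiest_load = c[i].wl;
			busiest_capacity = c[i].capacity;
			busiest = i;
		}
	}
	return busiest;
}
```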
  • 5、迁移busiest cpu的负载到本地dst cpu上,迁移的负载额度为env->imbalance:detach_tasks() -> attach_tasks();

  • 6、处理几种因为进程亲和力(affinity)导致busiest cpu不能迁移走足够进程的情况:LBF_DST_PINNED,尝试把dst_cpu更改为本cpu同sg的其他cpu;LBF_SOME_PINNED,当前层级不能完全均衡,尝试让父sd去均衡;LBF_ALL_PINNED,一个进程都不能迁移,去掉busiest cpu后重新进行load_balance();

  • 7、如果经过各种尝试后还是没有一个进程迁移成功,最后尝试一次active_balance;

/** It checks each scheduling domain to see if it is due to be balanced,* and initiates a balancing operation if so.** Balancing parameters are set up in init_sched_domains.* Balance的参数是在sched_domains初始化时设置的*/
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{int continue_balancing = 1;int cpu = rq->cpu;unsigned long interval;struct sched_domain *sd;/* 默认本cpu rq下一次的balance时间为60s以后 *//* Earliest time when we have to do rebalance again */unsigned long next_balance = jiffies + 60*HZ;int update_next_balance = 0;int need_serialize, need_decay = 0;u64 max_cost = 0;/* (1) 更新cpu rq中所有cfs_rq的最新负载 */update_blocked_averages(cpu);rcu_read_lock();/* (2) 对本cpu每个层次的schedule_domain进行扫描 */for_each_domain(cpu, sd) {/* (3) 以1HZ的频率对sd->max_newidle_lb_cost进行老化,老化公式: new = old * (253/256)*//** Decay the newidle max times here because this is a regular* visit to all the domains. Decay ~1% per second.*/if (time_after(jiffies, sd->next_decay_max_lb_cost)) {sd->max_newidle_lb_cost =(sd->max_newidle_lb_cost * 253) / 256;sd->next_decay_max_lb_cost = jiffies + HZ;need_decay = 1;}max_cost += sd->max_newidle_lb_cost;if (!(sd->flags & SD_LOAD_BALANCE))continue;#ifndef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT/* nohz CPU need GTS balance to migrate tasks for more than 2 clusters*//* Don't consider GTS balance if hybrid support */if (hybrid_support()) {if (sd->child || (!sd->child &&(rcu_dereference(per_cpu(sd_scs, cpu)) == NULL)))continue;}
#endif/* (4) 如果continue_balancing = 0,指示停止当前层级的load balance因为shed_group中其他的cpu正在这个层次做load_balance*//** Stop the load balance at this level. There is another* CPU in our sched group which is doing load balancing more* actively.*/if (!continue_balancing) {if (need_decay)continue;break;}/* (5) 计算当前层次schedule_domain的balance间隔时间 */interval = get_sd_balance_interval(sd, idle != CPU_IDLE);/* (6) 如果需要串行化(SD_SERIALIZE),做balance之前需要持锁 */need_serialize = sd->flags & SD_SERIALIZE;if (need_serialize) {if (!spin_trylock(&balancing))goto out;}/* (7) 如果本sd的balance间隔时间已到,进行实际的load_balance() */if (time_after_eq(jiffies, sd->last_balance + interval)) {if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {/** The LBF_DST_PINNED logic could have changed* env->dst_cpu, so we can't know our idle* state even if we migrated tasks. Update it.*/idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;}sd->last_balance = jiffies;interval = get_sd_balance_interval(sd, idle != CPU_IDLE);}if (need_serialize)spin_unlock(&balancing);
out:/* (8) 如果sd下一次balance时间在,rq的balance时间之前,需要更新rq的balance时间rq的下一次balance时间:next_balance  (默认是60s后)本sd的下一次balance时间:sd->last_balance + intervalrq的下一次balance时间需要选取多个sd中时间最近的一个*/if (time_after(next_balance, sd->last_balance + interval)) {next_balance = sd->last_balance + interval;update_next_balance = 1;}}if (need_decay) {/** Ensure the rq-wide value also decays but keep it at a* reasonable floor to avoid funnies with rq->avg_idle.*/rq->max_idle_balance_cost =max((u64)sysctl_sched_migration_cost, max_cost);}rcu_read_unlock();/* (8.1) 更新rq的balance时间 *//** next_balance will be updated only when there is a need.* When the cpu is attached to null domain for ex, it will not be* updated.*/if (likely(update_next_balance)) {rq->next_balance = next_balance;#ifdef CONFIG_NO_HZ_COMMON/** If this CPU has been elected to perform the nohz idle* balance. Other idle CPUs have already rebalanced with* nohz_idle_balance() and nohz.next_balance has been* updated accordingly. This CPU is now running the idle load* balance for itself and we need to update the* nohz.next_balance accordingly.*/if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))nohz.next_balance = rq->next_balance;
#endif}
}|→static int load_balance(int this_cpu, struct rq *this_rq,struct sched_domain *sd, enum cpu_idle_type idle,int *continue_balancing)
{int ld_moved, cur_ld_moved, active_balance = 0;struct sched_domain *sd_parent = sd->parent;struct sched_group *group;struct rq *busiest;unsigned long flags;struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);/* (7.1) 构造Load_balance需要的数据结构:.sd     = sd,   //本cpu在本tl层次的sd.dst_cpu    = this_cpu,   // 目的cpu是本cpu.dst_rq     = this_rq,    // 目的rq是本cpu的rq// load_balance的目的是找出负载最重的cpu,并将一部分负载迁移到本cpu上*/struct lb_env env = {.sd     = sd,.dst_cpu    = this_cpu,.dst_rq     = this_rq,.dst_grpmask    = sched_group_cpus(sd->groups),.idle       = idle,.loop_break = sched_nr_migrate_break,.cpus       = cpus,.fbq_type   = all,.tasks      = LIST_HEAD_INIT(env.tasks),};/** For NEWLY_IDLE load_balancing, we don't need to consider* other cpus in our group*/if (idle == CPU_NEWLY_IDLE)env.dst_grpmask = NULL;cpumask_copy(cpus, cpu_active_mask);schedstat_inc(sd, lb_count[idle]);redo:/* (7.2) check当前cpu是否适合作为dst_cpu(即light cpu,需要分担其他cpu的负载) */if (!should_we_balance(&env)) {*continue_balancing = 0;goto out_balanced;}/* (7.3) 找出本层级sched_group链表中,负载最重的(busiest)的sched_group */group = find_busiest_group(&env);if (!group) {schedstat_inc(sd, lb_nobusyg[idle]);goto out_balanced;}/* (7.4) 找出busiest sched_group中sched_group的rq,即负载最重cpu对应的rq */busiest = find_busiest_queue(&env, group);if (!busiest) {schedstat_inc(sd, lb_nobusyq[idle]);goto out_balanced;}BUG_ON(busiest == env.dst_rq);schedstat_add(sd, lb_imbalance[idle], env.imbalance);env.src_cpu = busiest->cpu;env.src_rq = busiest;ld_moved = 0;/* (7.5) 判断busiest cpu rq中的runnable进程数 > 1?至少有进程可以迁移走*/if (busiest->nr_running > 1) {/** Attempt to move tasks. If find_busiest_group has found* an imbalance but busiest->nr_running <= 1, the group is* still unbalanced. ld_moved simply stays zero, so it is* correctly treated as an imbalance.*/env.flags |= LBF_ALL_PINNED;env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);more_balance:raw_spin_lock_irqsave(&busiest->lock, flags);/* (7.6) 从busiest rq中detach进程, env->imbalance:需要迁移的负载大小cur_ld_moved:实际迁移的进程数*//** cur_ld_moved - load moved in current iteration* ld_moved     - cumulative load moved across iterations*/cur_ld_moved = detach_tasks(&env);/* (7.7) busiest cpu负载减轻后,在sched_freq中判断cpu频率是否可以调低*//** We want to potentially lower env.src_cpu's OPP.*/if (cur_ld_moved)update_capacity_of(env.src_cpu, SCHE_ONESHOT);/** We've detached some tasks from busiest_rq. Every* task is masked "TASK_ON_RQ_MIGRATING", so we can safely* unlock busiest->lock, and we are able to be sure* that nobody can manipulate the tasks in parallel.* See task_rq_lock() family for the details.*/raw_spin_unlock(&busiest->lock);/* (7.8) 把迁移过来的任务attack到dest_cpu上 */if (cur_ld_moved) {attach_tasks(&env);ld_moved += cur_ld_moved;}local_irq_restore(flags);/* (7.9) LBF_NEED_BREAK设置,说明balance还没有完成,循环只是出来休息一下,继续重新balance*/if (env.flags & LBF_NEED_BREAK) {env.flags &= ~LBF_NEED_BREAK;goto more_balance;}/* (7.10) 设置了LBF_DST_PINNED标志,并且env.imbalance > 0说明src_cpu上有些进程因为affinity的原因不能迁移到dst_cpu但是能迁移到同sg的new_dst_cpu上把dst_cpu更改为new_dst_cpu,重新开始balance流程*//** Revisit (affine) tasks on src_cpu that couldn't be moved to* us and move them to an alternate dst_cpu in our sched_group* where they can run. The upper limit on how many times we* iterate on same src_cpu is dependent on number of cpus in our* sched_group.** This changes load balance semantics a bit on who can move* load to a given_cpu. In addition to the given_cpu itself* (or a ilb_cpu acting on its behalf where given_cpu is* nohz-idle), we now have balance_cpu in a position to move* load to given_cpu. 
In rare situations, this may cause* conflicts (balance_cpu and given_cpu/ilb_cpu deciding* _independently_ and at _same_ time to move some load to* given_cpu) causing exceess load to be moved to given_cpu.* This however should not happen so much in practice and* moreover subsequent load balance cycles should correct the* excess load moved.*/if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {/* Prevent to re-select dst_cpu via env's cpus */cpumask_clear_cpu(env.dst_cpu, env.cpus);env.dst_rq   = cpu_rq(env.new_dst_cpu);env.dst_cpu  = env.new_dst_cpu;env.flags   &= ~LBF_DST_PINNED;env.loop     = 0;env.loop_break   = sched_nr_migrate_break;/** Go back to "more_balance" rather than "redo" since we* need to continue with same src_cpu.*/goto more_balance;}/* (7.11) 设置了LBF_SOME_PINNED标志,说明有些进程因为affinity迁移失败,  设置当前sd的parent sd的 sgc->imbalance,让parent sd做rebalance的概率增高*//** We failed to reach balance because of affinity.*/if (sd_parent) {int *group_imbalance = &sd_parent->groups->sgc->imbalance;if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)*group_imbalance = 1;}/* (7.12) 如果LBF_ALL_PINNED标志一直被置位,说明busiest_cpu因为affinity没有一个进程迁移成功,哪怕迁移到dst_cpu同sg的其他cpu也没有一个成功将busiest cpu从全局cpu mask去掉,重新做整个流程:find_busiest_group -> find_busiest_queue -> detach_tasks -> attach_tasks*//* All tasks on this runqueue were pinned by CPU affinity */if (unlikely(env.flags & LBF_ALL_PINNED)) {cpumask_clear_cpu(cpu_of(busiest), cpus);if (!cpumask_empty(cpus)) {env.loop = 0;env.loop_break = sched_nr_migrate_break;goto redo;}goto out_all_pinned;}}/* (7.13) 经过几轮的努力尝试,最终迁移的进程数ld_moved还是0,说明balance失败 */if (!ld_moved) {schedstat_inc(sd, lb_failed[idle]);/** Increment the failure counter only on periodic balance.* We do not want newidle balance, which can be very* frequent, pollute the failure counter causing* excessive cache_hot migrations and active balances.*/if (idle != CPU_NEWLY_IDLE)if (env.src_grp_nr_running > 1)sd->nr_balance_failed++;/* (7.14) 最后一次尝试迁移一个进程 */if (need_active_balance(&env)) {raw_spin_lock_irqsave(&busiest->lock, flags);/* (7.15) 如果当前cpu不在busiest->curr进程的affinity之内,返回失败 *//* don't kick the active_load_balance_cpu_stop,* if the curr task on busiest cpu can't be* moved to this_cpu*/if (!cpumask_test_cpu(this_cpu,tsk_cpus_allowed(busiest->curr))) {raw_spin_unlock_irqrestore(&busiest->lock,flags);env.flags |= LBF_ALL_PINNED;goto out_one_pinned;}/** ->active_balance synchronizes accesses to* ->active_balance_work.  Once set, it's cleared* only after active load balance is finished.*/if (!busiest->active_balance && !cpu_park(cpu_of(busiest))) {busiest->active_balance = 1; /* load_balance */busiest->push_cpu = this_cpu;active_balance = 1;}raw_spin_unlock_irqrestore(&busiest->lock, flags);/* (7.16) 迁移busiest->curr进程当前期cpu */if (active_balance) {if (stop_one_cpu_dispatch(cpu_of(busiest),active_load_balance_cpu_stop, busiest,&busiest->active_balance_work)) {raw_spin_lock_irqsave(&busiest->lock, flags);busiest->active_balance = 0;active_balance = 0;raw_spin_unlock_irqrestore(&busiest->lock, flags);}}/** We've kicked active balancing, reset the failure* counter.*/sd->nr_balance_failed = sd->cache_nice_tries+1;}} elsesd->nr_balance_failed = 0;if (likely(!active_balance)) {/* We were unbalanced, so reset the balancing interval */sd->balance_interval = sd->min_interval;} else {/** If we've begun active balancing, start to back off. 
This* case may not be covered by the all_pinned logic if there* is only 1 task on the busy runqueue (because we don't call* detach_tasks).*/if (sd->balance_interval < sd->max_interval)sd->balance_interval *= 2;}goto out;out_balanced:/** We reach balance although we may have faced some affinity* constraints. Clear the imbalance flag if it was set.*/if (sd_parent) {int *group_imbalance = &sd_parent->groups->sgc->imbalance;if (*group_imbalance)*group_imbalance = 0;}out_all_pinned:/** We reach balance because all tasks are pinned at this level so* we can't migrate them. Let the imbalance flag set so parent level* can try to migrate them.*/schedstat_inc(sd, lb_balanced[idle]);sd->nr_balance_failed = 0;out_one_pinned:/* tune up the balancing interval */if (((env.flags & LBF_ALL_PINNED) &&sd->balance_interval < MAX_PINNED_INTERVAL) ||(sd->balance_interval < sd->max_interval))sd->balance_interval *= 2;ld_moved = 0;
out:return ld_moved;
}||→static int should_we_balance(struct lb_env *env)
{struct sched_group *sg = env->sd->groups;struct cpumask *sg_cpus, *sg_mask;int cpu, balance_cpu = -1;/* (7.2.1) 如果本cpu为CPU_NEWLY_IDLE,直接符合迁移条件 *//** In the newly idle case, we will allow all the cpu's* to do the newly idle load balance.*/if (env->idle == CPU_NEWLY_IDLE)return 1;sg_cpus = sched_group_cpus(sg);sg_mask = sched_group_mask(sg);/* (7.2.2) 本sched_group的第一个idle cpu适合做load_balance *//* Try to find first idle cpu */for_each_cpu_and(cpu, sg_cpus, env->cpus) {if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))continue;balance_cpu = cpu;break;}/* (7.2.3) 没有idle cpu,则选取本sched_group的第一个cpu做load_balance */if (balance_cpu == -1)balance_cpu = group_balance_cpu(sg);/* (7.2.4) 不满足上述条件的cpu,不适合来启动load_balance *//** First idle cpu or the first cpu(busiest) in this sched group* is eligible for doing load balancing at this and above domains.*/return balance_cpu == env->dst_cpu;
}||→static struct sched_group *find_busiest_group(struct lb_env *env)
{struct sg_lb_stats *local, *busiest;struct sd_lb_stats sds;int local_cpu = 0, busiest_cpu = 0;struct cpumask *busiest_cpumask;int same_clus = 0;init_sd_lb_stats(&sds);/* (7.3.1) 更新本层级sched_group链表中,每个sched_group的负载,并选出busiest的一个sched_group*//** Compute the various statistics relavent for load balancing at* this level.*/update_sd_lb_stats(env, &sds);local = &sds.local_stat;busiest = &sds.busiest_stat;if (sds.busiest) {busiest_cpumask = sched_group_cpus(sds.busiest);local_cpu = env->dst_cpu;busiest_cpu = group_first_cpu(sds.busiest);same_clus = is_the_same_domain(local_cpu, busiest_cpu);mt_sched_printf(sched_lb, "%s: local_cpu=%d, busiest_cpu=%d, busiest_mask=%lu, same_cluster=%d",__func__, local_cpu, busiest_cpu, busiest_cpumask->bits[0], same_clus);}/* (7.3.2) 如果EAS使能,跨cluster的任务迁移使用EAS来做 */if (energy_aware() && !env->dst_rq->rd->overutilized && !same_clus)goto out_balanced;/* (7.3.3) *//* ASYM feature bypasses nice load balance check */if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&check_asym_packing(env, &sds))return sds.busiest;/* (7.3.4) busiest sg上没有负载,返回空 *//* There is no busy sibling group to pull tasks from */if (!sds.busiest || busiest->sum_nr_running == 0) {if (!sds.busiest)mt_sched_printf(sched_lb, "[%s] %d: fail no busiest ", __func__, env->src_cpu);elsemt_sched_printf(sched_lb, "[%s] %d: fail busiest no task ", __func__, env->src_cpu);goto out_balanced;}/* (7.3.5) sg链表里的平均负载 */sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)/ sds.total_capacity;/* (7.3.6) 如果busiest sg低一级别的因为cpu affinity没有balance成功,设置了group_imbalanced标志 强制在当前级别上进行balance*//** If the busiest group is imbalanced the below checks don't* work because they assume all things are equal, which typically* isn't true due to cpus_allowed constraints and the like.*/if (busiest->group_type == group_imbalanced)goto force_balance;/* (7.3.7) 如果dest cpu/group很闲,busiest负载很重,  强制开展balance*//* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&busiest->group_no_capacity)goto force_balance;/* (7.3.8)  如果dest_cpu所在sg的负载都大于busiest sg的负载,放弃balance*//** If the local group is busier than the selected busiest group* don't try and pull any tasks.*/if (local->avg_load >= busiest->avg_load)goto out_balanced;/* (7.3.9)  如果dest_cpu所在sg的负载都大于sg链表的平均负载,放弃balance*//** Don't pull any tasks if this group is already above the domain* average load.*/if (local->avg_load >= sds.avg_load)goto out_balanced;/* (7.3.10)  如果dest_cpu为idle,但是dest_cpu所在的sg idle cpu数量小于busiest sg的idle cpu数量放弃balance*/
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENTif ((env->idle == CPU_IDLE) || (env->idle == CPU_NEWLY_IDLE)) {int i = (env->idle == CPU_IDLE) ? 1:0;
#elseif (env->idle == CPU_IDLE) {
#endif/** This cpu is idle. If the busiest group is not overloaded* and there is no imbalance between this and busiest group* wrt idle cpus, it is balanced. The imbalance becomes* significant if the diff is greater than 1 otherwise we* might end up to just move the imbalance on another group*/
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENTif ((busiest->group_type != group_overloaded) &&(local->idle_cpus < (busiest->idle_cpus + i)))
#elseif ((busiest->group_type != group_overloaded) &&(local->idle_cpus <= (busiest->idle_cpus + 1)))
#endifgoto out_balanced;} else {/* (7.3.11)  busiest->avg_load大于local->avg_load的比例没有超过env->sd->imbalance_pct放弃balance*//** In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use* imbalance_pct to be conservative.*/if (100 * busiest->avg_load <=env->sd->imbalance_pct * local->avg_load)goto out_balanced;}force_balance:/* Looks like there is an imbalance. Compute it *//* (7.3.12) 计算需要迁移的负载值env->imbalance */calculate_imbalance(env, &sds);
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENTenv->imbalance = env->imbalance * SCHED_CAPACITY_SCALE/ (sds.busiest->sgc->capacity / cpumask_weight(sched_group_cpus(sds.busiest)));
#endifreturn sds.busiest;out_balanced:env->imbalance = 0;return NULL;
}|||→static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{struct sched_domain *child = env->sd->child;struct sched_group *sg = env->sd->groups;struct sg_lb_stats tmp_sgs;int load_idx, prefer_sibling = 0;bool overload = false, overutilized = false;if (child && child->flags & SD_PREFER_SIBLING)prefer_sibling = 1;/* (7.3.1.1) 根据idle情况,选择计算cpu负载时的idx,idx:是CPU层级负载this_rq->cpu_load[i]数组的index值*/load_idx = get_sd_load_idx(env->sd, env->idle);/* (7.3.1.2) 逐个轮询本层级sched_group链表中的每个sched_group */do {struct sg_lb_stats *sgs = &tmp_sgs;int local_group;/* (7.3.1.3) 如果sg是当前cpu所在的sg,则本sg称为local_group 使用专门的数据结构来存储local_group的信息:sds->local = sg;        // 使用sds->local来存储local_groupsgs = &sds->local_stat; // 使用sds->local_stat来存储local_group的统计*/local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));if (local_group) {sds->local = sg;sgs = &sds->local_stat;/* (7.3.1.4) 更新local_group的capacity,更新的周期为sd->balance_interval 主要目的是动态减去RT进程消耗的capacity*/if (env->idle != CPU_NEWLY_IDLE ||time_after_eq(jiffies, sg->sgc->next_update))update_group_capacity(env->sd, env->dst_cpu);}/* (7.3.1.5) 更新当前sched_group的负载统计 sgs:sg统计数据放到sgs当中overload:rq中runnable的进程>1,那么肯定有进程在等待overutilized:cpu的capacity < util,运算能力不足*/update_sg_lb_stats(env, sg, load_idx, local_group, sgs,&overload, &overutilized);/* (7.3.1.6) local_group不参与busiest sg的计算 */if (local_group)goto next_group;/* (7.3.1.7) 如果设置了SD_PREFER_SIBLING标志,说明local_group希望其他人迁移任务到它身上,提高其他sg的迁移优先级*//** In case the child domain prefers tasks go to siblings* first, lower the sg capacity so that we'll try* and move all the excess tasks away. We lower the capacity* of a group only if the local group has the capacity to fit* these excess tasks. The extra check prevents the case where* you always pull from the heaviest group when it is already* under-utilized (possible with a large weight task outweighs* the tasks on the system).*/if (prefer_sibling && sds->local &&group_has_capacity(env, &sds->local_stat) &&(sgs->sum_nr_running > 1)) {sgs->group_no_capacity = 1;sgs->group_type = group_classify(sg, sgs);}/* (7.3.1.8) 根据计算的sgs统计数据,找出busiest sg */if (update_sd_pick_busiest(env, sds, sg, sgs)) {sds->busiest = sg;sds->busiest_stat = *sgs;}next_group:/* (7.3.1.9) 更新sds中的负载、capacity统计 *//* Now, start updating sd_lb_stats */sds->total_load += sgs->group_load;sds->total_capacity += sgs->group_capacity;sg = sg->next;} while (sg != env->sd->groups);if (env->sd->flags & SD_NUMA)env->fbq_type = fbq_classify_group(&sds->busiest_stat);env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;/* (7.3.1.10) 根据最后一个sg的overload、overutilized值来更新dst_cpu rq->rd中的对应值 。ooooo这里是怎么想的?不是local_group,也不是busiest_group,而是最后一个计算的sg!!!*/if (!env->sd->parent) {/* update overload indicator if we are at root domain */if (env->dst_rq->rd->overload != overload)env->dst_rq->rd->overload = overload;/* Update over-utilization (tipping point, U >= 0) indicator */if (env->dst_rq->rd->overutilized != overutilized)env->dst_rq->rd->overutilized = overutilized;} else {if (!env->dst_rq->rd->overutilized && overutilized)env->dst_rq->rd->overutilized = true;}
}||||→static inline void update_sg_lb_stats(struct lb_env *env,struct sched_group *group, int load_idx,int local_group, struct sg_lb_stats *sgs,bool *overload, bool *overutilized)
{unsigned long load;int i;memset(sgs, 0, sizeof(*sgs));/*  (7.3.1.5.1) 遍历sched_group中的每个cpu */for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {struct rq *rq = cpu_rq(i);/* (7.3.1.5.2) 获取本cpu的负载rq->cpu_load[load_idx-1] *//* Bias balancing toward cpus of our domain */if (local_group)/* 如果是local_group,负载往小的取:min(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */load = target_load(i, load_idx);else/* 如果不是local_group,负载往大的取:max(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */load = source_load(i, load_idx);#ifdef CONFIG_MTK_SCHED_INTEROP/* (7.3.1.5.3) 因为rq->cpu_load[]只包含cfs的负载,mtk尝试加上rt部分的负载ooooo但是rq->cpu_capacity中已经减去了rt的部分,这里是否还需要??*/load += mt_rt_load(i);
#endif/* (7.3.1.5.4) 累加sgs各项值:sgs->group_load   // runnable负载带weight分量(cpu_rq(cpu)->cfs.avg.util_avg),经过rq->cpu_load[]计算sgs->group_util   // running负载(cpu_rq(cpu)->cfs.avg.load_avg/cpu_rq(cpu)->cfs.runnable_load_avg)sgs->sum_nr_running // rq中所有se的总和sgs->sum_weighted_load // runnable负载带weight分量(cpu_rq(cpu)->cfs.avg.util_avg)sgs->idle_cpus      // idle状态的cpu计数*/
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENTsgs->group_load += (load * capacity_orig_of(i)) >> SCHED_CAPACITY_SHIFT;
#elsesgs->group_load += load;
#endifsgs->group_util += cpu_util(i);sgs->sum_nr_running += rq->cfs.h_nr_running;/* (7.3.1.5.5) 如果rq中进程数量>1,则就会有进程处于runnable状态,overload = true*/if (rq->nr_running > 1)*overload = true;#ifdef CONFIG_NUMA_BALANCINGsgs->nr_numa_running += rq->nr_numa_running;sgs->nr_preferred_running += rq->nr_preferred_running;
#endifsgs->sum_weighted_load += weighted_cpuload(i);if (idle_cpu(i))sgs->idle_cpus++;/* (7.3.1.5.6) cpu的capacity小于cpu的running状态负载,overutilized = true*/if (cpu_overutilized(i))*overutilized = true;}/* (7.3.1.5.7) 更新汇总后sgs的统计数据:sgs->group_capacity     // sgs所有cpu capacity的累加sgs->avg_load           // 按照group_capacity,等比例放大group_load负载,capacity越小avg_load越大sgs->load_per_task      // sgs的平均每个进程的weight负载sgs->group_weight       // sgs的online cpu个数sgs->group_no_capacity  // sgs的capacity已经不够用,赶不上utilsgs->group_type         // 严重级别 group_overloaded > group_imbalanced > group_other// group_imbalanced: 下一等级的load_balance因为cpu_affinity的原因没有完成*//* Adjust by relative CPU capacity of the group */sgs->group_capacity = group->sgc->capacity;sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;if (sgs->sum_nr_running)sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;sgs->group_weight = group->group_weight;sgs->group_no_capacity = group_is_overloaded(env, sgs);sgs->group_type = group_classify(group, sgs);
}||||→static bool update_sd_pick_busiest(struct lb_env *env,struct sd_lb_stats *sds,struct sched_group *sg,struct sg_lb_stats *sgs)
{struct sg_lb_stats *busiest = &sds->busiest_stat;#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENTif (sgs->sum_nr_running == 0) {mt_sched_printf(sched_lb_info, "[%s] sgs->sum_nr_running=%d",__func__, sgs->sum_nr_running);return false;}
#endif/* (7.3.1.9.1) 如果新的sgs group_type大于旧的busiest sgs,新的sgs更busy*/if (sgs->group_type > busiest->group_type)return true;/* (7.3.1.9.2) 如果新的sgs group_type小于旧的busiest sgs,旧的sgs更busy*/if (sgs->group_type < busiest->group_type)return false;/* (7.3.1.9.3) 在group_type相同的情况下,比较sgs->avg_load sgs->avg_load = rq->cpu_load[load_idx-1] * (group_load*SCHED_CAPACITY_SCALE / sgs->group_capacity)*/if (sgs->avg_load <= busiest->avg_load)return false;/* (7.3.1.9.4) 如果SD_ASYM_PACKING标志没有置位,在group_type相同的情况下,sgs->avg_load值较大的为busiest sg*//* This is the busiest node in its class. */if (!(env->sd->flags & SD_ASYM_PACKING))return true;/* (7.3.1.9.5) ASYM_PACKING的意思是会把负载移到最低序号的cpu上,如果sg的frist cpu序号 > dst_cpu,则busiest对个sg的frist cpu序号 > dst_cpu,选择序号小的sg*//** ASYM_PACKING needs to move all the work to the lowest* numbered CPUs in the group, therefore mark all groups* higher than ourself as busy.*/if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {if (!sds->busiest)return true;if (group_first_cpu(sds->busiest) > group_first_cpu(sg))return true;}/* (7.3.1.9.6) 设置了ASYM_PACKING,且如果sg的frist cpu序号 <= dst_cpu,返回false*/return false;
}|||→static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{unsigned long max_pull, load_above_capacity = ~0UL;struct sg_lb_stats *local, *busiest;/* (7.3.12.1) local sgs和busiest sgs */local = &sds->local_stat;busiest = &sds->busiest_stat;if (busiest->group_type == group_imbalanced) {/** In the group_imb case we cannot rely on group-wide averages* to ensure cpu-load equilibrium, look at wider averages. XXX*/busiest->load_per_task =min(busiest->load_per_task, sds->avg_load);}/* (7.3.12.2) *//** In the presence of smp nice balancing, certain scenarios can have* max load less than avg load(as we skip the groups at or below* its cpu_capacity, while calculating max_load..)*/if (busiest->avg_load <= sds->avg_load ||local->avg_load >= sds->avg_load) {env->imbalance = 0;return fix_small_imbalance(env, sds);}/* (7.3.12.3) *//** If there aren't any idle cpus, avoid creating some.*/if (busiest->group_type == group_overloaded &&local->group_type   == group_overloaded) {load_above_capacity = busiest->sum_nr_running *SCHED_LOAD_SCALE;if (load_above_capacity > busiest->group_capacity)load_above_capacity -= busiest->group_capacity;elseload_above_capacity = ~0UL;}/* (7.3.12.4) env->imbalance的值等于min((sds->avg - local), (busiest - sds->avg))在local和sds平均值,busiest和sds平均值,两个差值之间选择最小值*//** We're trying to get all the cpus to the average_load, so we don't* want to push ourselves above the average load, nor do we wish to* reduce the max loaded cpu below the average load. At the same time,* we also don't want to reduce the group load below the group capacity* (so that we can implement power-savings policies etc). Thus we look* for the minimum possible imbalance.*/max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);/* How much load to actually move to equalise the imbalance */env->imbalance = min(max_pull * busiest->group_capacity,(sds->avg_load - local->avg_load) * local->group_capacity) / SCHED_CAPACITY_SCALE;/** if *imbalance is less than the average load per runnable task* there is no guarantee that any tasks will be moved so we'll have* a think about bumping its value to force at least one task to be* moved*/if (env->imbalance < busiest->load_per_task)return fix_small_imbalance(env, sds);
}||→static struct rq *find_busiest_queue(struct lb_env *env,struct sched_group *group)
{struct rq *busiest = NULL, *rq;unsigned long busiest_load = 0, busiest_capacity = 1;int i;/* (7.4.1) 逐个遍历sg中的cpu */for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {unsigned long capacity, wl;enum fbq_type rt;rq = cpu_rq(i);rt = fbq_classify_rq(rq);/** We classify groups/runqueues into three groups:*  - regular: there are !numa tasks*  - remote:  there are numa tasks that run on the 'wrong' node*  - all:     there is no distinction** In order to avoid migrating ideally placed numa tasks,* ignore those when there's better options.** If we ignore the actual busiest queue to migrate another* task, the next balance pass can still reduce the busiest* queue by moving tasks around inside the node.** If we cannot move enough load due to this classification* the next pass will adjust the group classification and* allow migration of more tasks.** Both cases only affect the total convergence complexity.*/if (rt > env->fbq_type)continue;/* (7.4.2) 计算出cpu的capacity和weight_load */capacity = capacity_of(i);wl = weighted_cpuload(i);#ifdef CONFIG_MTK_SCHED_INTEROPwl += mt_rt_load(i);
#endif/** When comparing with imbalance, use weighted_cpuload()* which is not scaled with the cpu capacity.*/if (rq->nr_running == 1 && wl > env->imbalance &&!check_cpu_capacity(rq, env->sd))continue;/* (7.4.3) 选出相对负载最重的cpu *//** For the load comparisons with the other cpu's, consider* the weighted_cpuload() scaled with the cpu capacity, so* that the load can be moved away from the cpu that is* potentially running at a lower capacity.** Thus we're looking for max(wl_i / capacity_i), crosswise* multiplication to rid ourselves of the division works out* to: wl_i * capacity_j > wl_j * capacity_i;  where j is* our previous maximum.*/if (wl * busiest_capacity > busiest_load * capacity) {busiest_load = wl;busiest_capacity = capacity;busiest = rq;}}return busiest;
}||→static int detach_tasks(struct lb_env *env)
{struct list_head *tasks = &env->src_rq->cfs_tasks;struct task_struct *p;unsigned long load;int detached = 0;lockdep_assert_held(&env->src_rq->lock);if (env->imbalance <= 0)return 0;/* (7.6.1) 遍历busiest rq中的进程 */while (!list_empty(tasks)) {/* (7.6.2) 如果dest cpu不是idle,不能将busiest cpu迁移到idle状态 */    /** We don't want to steal all, otherwise we may be treated likewise,* which could at worst lead to a livelock crash.*/if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)break;p = list_first_entry(tasks, struct task_struct, se.group_node);/* (7.6.3) 遍历任务最多不超过sysctl_sched_nr_migrate(32) */env->loop++;/* We've more or less seen every task there is, call it quits */if (env->loop > env->loop_max)break;/* (7.6.4) 每sched_nr_migrate_break个任务遍历需要跳出休息一下,如果没有达到env->loop_max,后面会重来*//* take a breather every nr_migrate tasks */if (env->loop > env->loop_break) {env->loop_break += sched_nr_migrate_break;env->flags |= LBF_NEED_BREAK;break;}/* (7.6.5) 判断任务是否支持迁移? */if (!can_migrate_task(p, env))goto next;/* (7.6.6) 获取p进程相对顶层cfs_rq的负载, 根据负载判断进程是否适合迁移*/load = task_h_load(p);if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)goto next;if ((load / 2) > env->imbalance)goto next;/* (7.6.7) detach 进程 */detach_task(p, env);list_add(&p->se.group_node, &env->tasks);detached++;env->imbalance -= load;#ifdef CONFIG_PREEMPT/** NEWIDLE balancing is a source of latency, so preemptible* kernels will stop after the first task is detached to minimize* the critical section.*/if (env->idle == CPU_NEWLY_IDLE)break;
#endif/** We only want to steal up to the prescribed amount of* weighted load.*/if (env->imbalance <= 0)break;continue;
next:list_move_tail(&p->se.group_node, tasks);}/** Right now, this is one of only two places we collect this stat* so we can safely collect detach_one_task() stats here rather* than inside detach_one_task().*/schedstat_add(env->sd, lb_gained[env->idle], detached);return detached;
}|||→static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{int tsk_cache_hot;lockdep_assert_held(&env->src_rq->lock);/** We do not migrate tasks that are:* 1) throttled_lb_pair, or* 2) cannot be migrated to this CPU due to cpus_allowed, or* 3) running (obviously), or* 4) are cache-hot on their current CPU.*//* (7.6.5.1) 如果达到bandwith限制,返回失败 */if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))return 0;/* (7.6.5.2) 如果p进程的cpu affinity不允许迁移到dst_cpu,进一步处理 */if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {int cpu;schedstat_inc(p, se.statistics.nr_failed_migrations_affine);/* (7.6.5.3) LBF_SOME_PINNED标志,记录有些进程迁移失败 */env->flags |= LBF_SOME_PINNED;/* (7.6.5.5) 如果已经有其他的LBF_DST_PINNED动作,直接返回失败 *//** Remember if this task can be migrated to any other cpu in* our sched_group. We may want to revisit it if we couldn't* meet load balance goals by pulling other tasks on src_cpu.** Also avoid computing new_dst_cpu if we have already computed* one in current iteration.*/if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))return 0;/* (7.6.5.4) 如果dst_cpu同一sched_group中的其他cpu符合p的affinity,尝试更改dst_cpu,设置LBF_DST_PINNED标志*//* Prevent to re-select dst_cpu via env's cpus */for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {env->flags |= LBF_DST_PINNED;env->new_dst_cpu = cpu;break;}}return 0;}/* (7.6.5.6) 有任何符合affinity条件的p,清除LBF_ALL_PINNED标志 *//* Record that we found atleast one task that could run on dst_cpu */env->flags &= ~LBF_ALL_PINNED;/* (7.6.5.7) 如果p在running状态,返回失败 */if (task_running(env->src_rq, p)) {schedstat_inc(p, se.statistics.nr_failed_migrations_running);return 0;}/* (7.6.5.8) NUMA 相关的一些判断  *//** Aggressive migration if:* 1) destination numa is preferred* 2) task is cache cold, or* 3) too many balance attempts have failed.*/tsk_cache_hot = migrate_degrades_locality(p, env);if (tsk_cache_hot == -1)tsk_cache_hot = task_hot(p, env);if (tsk_cache_hot <= 0 ||env->sd->nr_balance_failed > env->sd->cache_nice_tries) {if (tsk_cache_hot == 1) {schedstat_inc(env->sd, lb_hot_gained[env->idle]);schedstat_inc(p, se.statistics.nr_forced_migrations);}return 1;}schedstat_inc(p, se.statistics.nr_failed_migrations_hot);return 0;
}|||→static unsigned long task_h_load(struct task_struct *p)
{struct cfs_rq *cfs_rq = task_cfs_rq(p);update_cfs_rq_h_load(cfs_rq);/* (7.6.6.1) task_h_load的目的是在task_group使能时,rq中有多个层次的cfs_rq 如果进程p挂载在底层的cfs_rq中,把p的负载转换成顶层cfs_rq的相对负载*/return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,cfs_rq_load_avg(cfs_rq) + 1);
}static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{struct rq *rq = rq_of(cfs_rq);struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];u64 now = sched_clock_cpu(cpu_of(rq));unsigned long load;/* sched: change to jiffies */now = now * HZ >> 30;if (cfs_rq->last_h_load_update == now)return;/* 从底层cfs_rq到顶层cfs_rq建立起层次关系 */cfs_rq->h_load_next = NULL;for_each_sched_entity(se) {cfs_rq = cfs_rq_of(se);cfs_rq->h_load_next = se;if (cfs_rq->last_h_load_update == now)break;}if (!se) {cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);cfs_rq->last_h_load_update = now;}/* 使用建立的关系,从顶层cfs_rq开始计算每个层次cfs_rq的相对顶层负载h_load */while ((se = cfs_rq->h_load_next) != NULL) {load = cfs_rq->h_load;load = div64_ul(load * se->avg.load_avg,cfs_rq_load_avg(cfs_rq) + 1);cfs_rq = group_cfs_rq(se);cfs_rq->h_load = load;cfs_rq->last_h_load_update = now;}
}

4.1.2.2、nohz_idle_balance()

每个cpu的负载均衡是在本cpu的tick任务scheduler_tick()中判断执行的。如果cpu进入了nohz模式,scheduler_tick()被stop,那么本cpu就没有机会去做rebalance_domains()。为了解决这个问题,系统设计了nohz_idle_balance():在运行的cpu上判断进入nohz的cpu是否需要rebalance load,如果需要,就选择一个idle cpu来帮所有的nohz idle cpu做负载均衡。

在rebalance_domains()函数之前有一个nohz_idle_balance(),这是系统在条件满足的情况下让一个idle cpu做idle负载均衡。主要的原理如下:

  • 1、cpu在进入nohz idle状态时,设置标志:

tick_nohz_idle_enter() -> set_cpu_sd_state_idle():↓void set_cpu_sd_state_idle(void)
{struct sched_domain *sd;int cpu = smp_processor_id();rcu_read_lock();sd = rcu_dereference(per_cpu(sd_busy, cpu));if (!sd || sd->nohz_idle)goto unlock;/* (1.1) 进入nohz idle,设置sd->nohz_idle标志 */sd->nohz_idle = 1;/* (1.2) 减少sgc->nr_busy_cpus的计数 */atomic_dec(&sd->groups->sgc->nr_busy_cpus);
unlock:rcu_read_unlock();
}tick_nohz_idle_enter() -> __tick_nohz_idle_enter() -> tick_nohz_stop_sched_tick() -> nohz_balance_enter_idle():↓void nohz_balance_enter_idle(int cpu)
{/** If this cpu is going down, then nothing needs to be done.*/if (!cpu_active(cpu))return;if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))return;/** If we're a completely isolated CPU, we don't play.*/if (on_null_domain(cpu_rq(cpu)))return;/* (2.1) 进入idle状态,设置nohz.idle_cpus_mask中对应的bit */cpumask_set_cpu(cpu, nohz.idle_cpus_mask);/* (2.2) 进入idle状态,增加nohz.nr_cpus计数 */atomic_inc(&nohz.nr_cpus);/* (2.3) 设置cpu_rq(cpu)->nohz_flags中的NOHZ_TICK_STOPPED标志 */set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
  • 2、在trigger_load_balance()中判断,当前是否需要触发idle load balance:

void trigger_load_balance(struct rq *rq)
{/* (1) 判断当前是否需要idle load balance */if (nohz_kick_needed(rq))/* (2) 选中一个idle cpu去做idle load balance */nohz_balancer_kick();}|→/** Current heuristic for kicking the idle load balancer in the presence* of an idle cpu in the system.*   - This rq has more than one task.*   - This rq has at least one CFS task and the capacity of the CPU is*     significantly reduced because of RT tasks or IRQs.*   - At parent of LLC scheduler domain level, this cpu's scheduler group has*     multiple busy cpu.*   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler*     domain span are idle.*/
static inline bool nohz_kick_needed(struct rq *rq)
{unsigned long now = jiffies;struct sched_domain *sd;struct sched_group_capacity *sgc;int nr_busy, cpu = rq->cpu;bool kick = false;/* (1.1) 如果当前cpu为idle状态,失败退出 */if (unlikely(rq->idle_balance))return false;/* (1.2) 退出nohz状态:set_cpu_sd_state_busy()、nohz_balance_exit_idle(cpu)是set_cpu_sd_state_idle()、nohz_balance_enter_idle()的反向操作*//** We may be recently in ticked or tickless idle mode. At the first* busy tick after returning from idle, we will update the busy stats.*/set_cpu_sd_state_busy();nohz_balance_exit_idle(cpu);/* (1.3) 如果进入nohz idle状态的cpu数量为0,失败退出 *//** None are in tickless mode and hence no need for NOHZ idle load* balancing.*/if (likely(!atomic_read(&nohz.nr_cpus)))return false;/* (1.4) nohz balance时间未到,失败退出 */if (time_before(now, nohz.next_balance))return false;#if !defined(CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_HMP)/* for more than two clusters, still need wakup nohz CPUs and force balancing *//** Bail out if there are no nohz CPUs in our* HMP domain, since we will move tasks between* domains through wakeup and force balancing* as necessary based upon task load.*/if (sched_feat(SCHED_HMP) && cpumask_first_and(nohz.idle_cpus_mask,&((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)return false;
#endif/* (1.5) 当前cpu的进程>=2,返回成功 */if (rq->nr_running >= 2 &&(!energy_aware() || cpu_overutilized(cpu)))return true;/* (1.6) sd所在sg的nr_busy_cpus>1,返回成功 */rcu_read_lock();sd = rcu_dereference(per_cpu(sd_busy, cpu));if (sd && !energy_aware()) {sgc = sd->groups->sgc;nr_busy = atomic_read(&sgc->nr_busy_cpus);if (nr_busy > 1) {kick = true;goto unlock;}}/* (1.7) 如果所有层次的se个数>=1,且capacity在减少,返回成功 */sd = rcu_dereference(rq->sd);if (sd) {if ((rq->cfs.h_nr_running >= 1) &&check_cpu_capacity(rq, sd)) {kick = true;goto unlock;}}/* (1.8) 如果本sd->span[]中第一个idle cpu < sd_asym,返回成功 */sd = rcu_dereference(per_cpu(sd_asym, cpu));if (sd && (cpumask_first_and(nohz.idle_cpus_mask,sched_domain_span(sd)) < cpu)) {kick = true;goto unlock;}unlock:rcu_read_unlock();return kick;
}|→static void nohz_balancer_kick(void)
{int ilb_cpu;nohz.next_balance++;/* (2.1) 找到所有idle cpu中的第一个idle cpu */ilb_cpu = find_new_ilb();if (ilb_cpu >= nr_cpu_ids)return;/* (2.2) 给ilb_cpu的cpu_rq(cpu)->nohz_flags设置NOHZ_BALANCE_KICK标志位 */if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))return;/* (2.3) 使用ipi中断来唤醒ilb_cpu执行idle load balance *//** Use smp_send_reschedule() instead of resched_cpu().* This way we generate a sched IPI on the target cpu which* is idle. And the softirq performing nohz idle load balance* will be run before returning from the IPI.*/smp_send_reschedule(ilb_cpu);return;
}/* (2.3.1) ilb_cpu倍唤醒后处理IPI_RESCHEDULE,会触发一个SCHED_SOFTIRQ软中断来启动run_rebalance_domains()*/void handle_IPI(int ipinr, struct pt_regs *regs)
{unsigned int cpu = smp_processor_id();struct pt_regs *old_regs = set_irq_regs(regs);if ((unsigned)ipinr < NR_IPI) {trace_ipi_entry_rcuidle(ipi_types[ipinr]);__inc_irq_stat(cpu, ipi_irqs[ipinr]);}switch (ipinr) {case IPI_RESCHEDULE:scheduler_ipi();break;}↓void scheduler_ipi(void)
{/** Check if someone kicked us for doing the nohz idle load balance.*/if (unlikely(got_nohz_idle_kick())) {this_rq()->idle_balance = 1;raise_softirq_irqoff(SCHED_SOFTIRQ);}}
  • 3、被选中的ilb_cpu被唤醒后,需要帮其他所有idle cpu完成rebalance_domains()工作:

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{int this_cpu = this_rq->cpu;struct rq *rq;int balance_cpu;/* Earliest time when we have to do rebalance again */unsigned long next_balance = jiffies + 60*HZ;int update_next_balance = 0;/* (1) 判断当前cpu是不是被选中被唤醒的ilb_cpu */if (idle != CPU_IDLE ||!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))goto end;/* (2) 轮询所有进入onhz状态的cpu */for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {/* (3) 只服务非本cpu,且还是idle状态的cpu ooooo本cpu也是idle状态,不需对本cpu做idle负载均衡?ooooo给其他idle cpu的rq做了负载均衡后,什么时候唤醒其他idle cpu?*/if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))continue;/* (4) 如果本cpu被设置了resched标志,说明有线程被唤醒,退出idle状态 *//** If this cpu gets work to do, stop the load balancing* work being done for other cpus. Next load* balancing owner will pick it up.*/if (need_resched())break;/* (5) 需要做负载均衡的idle进程balance_cpu */rq = cpu_rq(balance_cpu);/* (6) 如果balance_cpu的rq->next_balance时间已到,替其做rebalance_domains() *//** If time for next balance is due,* do the balance.*/if (time_after_eq(jiffies, rq->next_balance)) {raw_spin_lock_irq(&rq->lock);update_rq_clock(rq);/* (7) 更新idle cpu因为idle造成的负载衰减 */update_idle_cpu_load(rq);raw_spin_unlock_irq(&rq->lock);/* (8) 对balance_cpu做负载均衡 ooooo做完负载均衡,什么时候唤醒balance_cpu??*/rebalance_domains(rq, CPU_IDLE);}if (time_after(next_balance, rq->next_balance)) {next_balance = rq->next_balance;update_next_balance = 1;}}/* (9) 根据所有进入nohz idle cpu rq的最近的一次到期时间,更新nohz.next_balance *//** next_balance will be updated only when there is a need.* When the CPU is attached to null domain for ex, it will not be* updated.*/if (likely(update_next_balance))nohz.next_balance = next_balance;
end:clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}

4.1.2.3、select_task_rq_fair()

除了scheduler_tick()时会做负载均衡,还有一个时机也会做负载均衡:fork新进程、wakeup唤醒休眠进程时,系统会根据负载情况挑选一个最合适的cpu给进程运行,其核心函数就是select_task_rq_fair():

  • 1、首先是使用EAS的方法来select_cpu,在EAS使能且没有overutilized时使用EAS方法:

需要重点提一下的是:负载统计中计算了3种负载(load_avg、loadwop_avg、util_avg),EAS主要使用其中的util_avg,和capacity放在一起计算。
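
util_avg通过cpu_util()读取,其返回值会被clamp到该cpu的capacity_orig以内,便于直接和capacity比较(逻辑示意):

```c
/* 示意:cpu的running负载,上限为该cpu的capacity_orig */
static inline unsigned long cpu_util_sketch(unsigned long util_avg,
					    unsigned long capacity_orig)
{
	return util_avg >= capacity_orig ? capacity_orig : util_avg;
}
```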

  • 1.1、EAS遍历cluster和cpu,找到一个target_cpu:它既能满足进程p的affinity,又能容纳下进程p的负载util,属于能以最小capacity满足需求的cluster,并且是该cluster中剩余capacity最多的cpu;

首先找到能容纳进程p的util且capacity最小的cluster:

然后在目标cluster中找到加上进程p以后,剩余capacity最大的cpu:

prev_cpu(进程p上一次运行的cpu)作为src_cpu,上面选择的target_cpu作为dst_cpu,接下来就是计算进程p从prev_cpu迁移到target_cpu后系统的功耗差异:

  • 1.2、计算负载变化前后,target_cpu和prev_cpu带来的power变化。如果没有power增加则返回target_cpu,如果有power增加则返回prev_cpu;
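
把1.1和1.2两步合起来,EAS选核的主干逻辑大致如下面的草图所示(省略了tiny task、overutilized等特殊路径;其中find_min_capacity_cluster_cpu()是为示意而虚构的辅助函数,其余函数沿用下文代码中的名字):

```c
/* 草图:EAS唤醒选核的主干流程(简化版,非完整实现) */
static int eas_wake_cpu_sketch(struct task_struct *p, int prev_cpu)
{
	/* 1.1:先找能容纳util(p)且capacity最小的cluster(虚构的辅助函数),
	 *      再在该cluster中找剩余capacity最大的cpu
	 */
	int best_cpu = find_min_capacity_cluster_cpu(p);
	int target_cpu = select_max_spare_capacity_cpu(p, best_cpu);

	struct energy_env eenv = {
		.util_delta	= task_util(p),
		.src_cpu	= prev_cpu,
		.dst_cpu	= target_cpu,
		.task		= p,
	};

	/* 1.2:迁移后功耗没有降低(energy_diff() >= 0)则留在prev_cpu */
	if (target_cpu != prev_cpu && energy_diff(&eenv) >= 0)
		return prev_cpu;

	return target_cpu;
}
```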

计算负载变化的函数energy_diff()循环很多、比较复杂,仔细分析下来,就是计算target_cpu/prev_cpu在"MC层级cpu所在sg链表"+"DIE层级cpu所在sg"这两个范围内,负载变化前后的功耗差异:

energy_diff()的计算方法如下:

| 负载值 | 计算方法 | 说明 |
|---|---|---|
| idle_idx | min(rq->idle_state_idx) | sg多个cpu中,idle_state_idx的最小值 |
| eenv->cap_idx | find_new_capacity() | 负载变化后,根据sg多个cpu中的最大util值匹配出的cpu freq档位,对应sg->sge->cap_states[eenv->cap_idx].cap |
| group_util | += (__cpu_util << SCHED_CAPACITY_SHIFT)/sg->sge->cap_states[eenv->cap_idx].cap | 累加sg中cpu的util值,并且把util按capacity反比折算 |
| sg_busy_energy | (group_util * sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT | 使用group_util计算busy部分消耗的功耗 |
| sg_idle_energy | ((SCHED_LOAD_SCALE - group_util) * sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT | 使用(SCHED_LOAD_SCALE - group_util)计算idle部分消耗的功耗 |
| total_energy | sg_busy_energy + sg_idle_energy | 单个sg的功耗;累计所有相关sg的功耗,迁移前后的总差异就是进程p迁移以后的功耗差异 |
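
按上表的公式,单个sg在某一负载下的功耗估算可以写成下面的草图(busy_power/idle_power即上表中sg->sge回调的返回值,这里作为参数传入;SCHED_LOAD_SCALE/SCHED_CAPACITY_SHIFT取内核默认的1024/10):

```c
#define SCHED_CAPACITY_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

/* 草图:单个sg的功耗 = busy部分 + idle部分 */
static unsigned long sg_energy_sketch(unsigned long group_util,
				      unsigned long busy_power,
				      unsigned long idle_power)
{
	unsigned long busy_energy =
		(group_util * busy_power) >> SCHED_CAPACITY_SHIFT;
	unsigned long idle_energy =
		((SCHED_LOAD_SCALE - group_util) * idle_power) >> SCHED_CAPACITY_SHIFT;

	return busy_energy + idle_energy;
}
```

energy_diff()就是对迁移前(eenv_before)和迁移后(eenv)、所有受影响的sg分别做这一估算并求和,最后相减得到nrg.diff。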
  • 2、如果EAS不适用,使用传统的负载均衡方法来select_cpu:
  • 2.1、find_idlest_group() -> find_idlest_cpu(),找出最合适的target_cpu;
  • 2.2、最差的情况下使用select_idle_sibling(),将就找到一个idle cpu作为target_cpu;
  • 2.3、确定target_cpu后,继续使用hmp_select_task_rq_fair()来判断是否需要进行hmp迁移;
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;int cpu = smp_processor_id();int new_cpu = prev_cpu;  /* 默认new_cpu为prev_cpu */int want_affine = 0;int sync = wake_flags & WF_SYNC;int policy = 0;#ifdef CONFIG_MTK_SCHED_VIP_TASKS/* mtk: If task is VIP task, prefer most efficiency idle cpu */if (is_vip_task(p)) {int vip_idle_cpu;vip_idle_cpu = find_idle_vip_cpu(p);if (vip_idle_cpu >= 0)return vip_idle_cpu;}
#endif/* (1) 优先使用EAS计算target cpu, mtk 对EAS定义了3种模式:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid_support(EAS、HMP同时共存);hybrid_support()模式下:一般负载均衡交给EAS;如果cpu_rq(cpu)->rd->overutilized负载已经严重不均衡,交给HMP;*//**  Consider EAS if only EAS enabled, but HMP*  if hybrid enabled and system is over-utilized.*/if ((energy_aware() && !hybrid_support()) ||(hybrid_support() && !cpu_rq(cpu)->rd->overutilized))goto CONSIDER_EAS;/* (2) 非EAS情况,fork使用hmp balance *//* HMP fork balance:* always put non-kernel forking tasks on a big domain*/if (sched_feat(SCHED_HMP) && p->mm && (sd_flag & SD_BALANCE_FORK)) {new_cpu = hmp_fork_balance(p, prev_cpu);/* to recover new_cpu value if something wrong */if (new_cpu >= nr_cpu_ids)new_cpu = prev_cpu;else {
#ifdef CONFIG_MTK_SCHED_TRACERStrace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
#endifreturn new_cpu;}}CONSIDER_EAS:/* (3) 如果唤醒flag中设置了SD_BALANCE_WAKE,优先使用唤醒cpu来运行进程p,还需判断下面3个条件是否满足:!wake_wide(p)           // 当前cpu的唤醒次数没有超标task_fits_max(p, cpu)   // 当前cpu的capacity能容纳进程p的utilcpumask_test_cpu(cpu, tsk_cpus_allowed(p)) // 当前cpu在进程在P的affinity中EAS利用了want_affine这个标志,只要EAS使能,want_affine =1*/if (sd_flag & SD_BALANCE_WAKE)want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||energy_aware();rcu_read_lock();/* (4) 从下往上遍历当前cpu的sd,查询在哪个层次的sd进行负载均衡 */for_each_domain(cpu, tmp) {/* (4.1 如果当前sd不支持负载均SD_LOAD_BALANCE,退出) */if (!(tmp->flags & SD_LOAD_BALANCE))break;/* (4.2) 优先找affine_sd,找到直接break;需要符合以下3个条件:want_affine                     //(tmp->flags & SD_WAKE_AFFINE)   // 当前sd支持SD_WAKE_AFFINE标志cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))  //当前sd->span[]中同时包含cpu、pre_cpu*//** If both cpu and prev_cpu are part of this domain,* cpu is a valid SD_WAKE_AFFINE target.*/if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {affine_sd = tmp;break;}/* (4.3) 其次找一个符合sd_flag的sd */if (tmp->flags & sd_flag)sd = tmp;/* (4.4) 如果以上都失败,直接跳出 */else if (!want_affine)break;}/* (5) 如果affine_sd成功找到*/if (affine_sd) {sd = NULL; /* Prefer wake_affine over balance flags */if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))new_cpu = cpu;}/* (6) 没有找到符合sd_flag的sd */if (!sd) {/* (6.1) EAS使能,且本cpu没有overutilized, 使用EAS负载均衡算法*/if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) {new_cpu = energy_aware_wake_cpu(p, prev_cpu);policy |= LB_EAS;}/* (6.2) 如果不能使用EAS,且sd_flag中设置SD_BALANCE_WAKE标志 尝试在唤醒的cpu上运行p进程,ooooo前面辛苦计算的affine_sd没有派上用场?*/else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */if (true) {
#ifdef CONFIG_CGROUP_SCHEDTUNEbool prefer_idle = schedtune_prefer_idle(p) > 0;
#elsebool prefer_idle = true;
#endifint idle_cpu;idle_cpu = find_best_idle_cpu(p, prefer_idle);if (idle_cpu >= 0) {new_cpu = idle_cpu;policy |= LB_IDLEST;} else {new_cpu = select_max_spare_capacity_cpu(p, new_cpu);policy |= LB_SPARE;}} else/* (6.3) 不符合上述条件下的默认处理,尝试找一个idle cpu */new_cpu = select_idle_sibling(p, new_cpu);}} else while (sd) {/* (7) 找到符合sd_flag的sd */struct sched_group *group;int weight;policy |= LB_SMP;/* (7.1) */if (!(sd->flags & sd_flag)) {sd = sd->child;continue;}/* (7.2) */group = find_idlest_group(sd, p, cpu, sd_flag);if (!group) {sd = sd->child;continue;}/* (7.3) */new_cpu = find_idlest_cpu(group, p, cpu);if (new_cpu == -1 || new_cpu == cpu) {/* Now try balancing at a lower domain level of cpu */sd = sd->child;continue;}/* (7.4) *//* Now try balancing at a lower domain level of new_cpu */cpu = new_cpu;weight = sd->span_weight;sd = NULL;for_each_domain(cpu, tmp) {if (weight <= tmp->span_weight)break;if (tmp->flags & sd_flag)sd = tmp;}/* while loop will break here if sd == NULL */}
#ifdef CONFIG_MTK_SCHED_TRACERSpolicy |= (new_cpu << LB_SMP_SHIFT);
#endifrcu_read_unlock();/* (8) 在EAS不能运行的情况下,在做一次HMP的select操作:判断进程p是否符合hmp的迁移条件,如果符合一次迁移到位,避免后续hmp的操作*//*  Consider hmp if no EAS  or over-utiled in hybrid mode. */if ((!energy_aware() && sched_feat(SCHED_HMP)) ||(hybrid_support() && cpu_rq(cpu)->rd->overutilized)) {new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);
#ifdef CONFIG_MTK_SCHED_TRACERSpolicy |= (new_cpu << LB_HMP_SHIFT);
#endifpolicy |= LB_HMP;}#ifdef CONFIG_MTK_SCHED_TRACERStrace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);
#endifreturn new_cpu;
}|→inline int hmp_fork_balance(struct task_struct *p, int prev_cpu)
{int new_cpu = prev_cpu;int cpu = smp_processor_id();/* (2.1) prev_cpu所在cluster是最快(fastest)的  */if (hmp_cpu_is_fastest(prev_cpu)) {/* prev_cpu is fastest domain */struct hmp_domain *hmpdom;__always_unused int lowest_ratio;hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains,struct hmp_domain, hmp_domains);/* (2.2) 尝试选出负载最小的cpu */lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);if (new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu, tsk_cpus_allowed(p)))return new_cpu;new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus,tsk_cpus_allowed(p));if (new_cpu < nr_cpu_ids)return new_cpu;} else {/* (2.3) 尝试选出prev_cpu所在cluster中负载最小的cpu *//* prev_cpu is not fastest domain */new_cpu = hmp_select_faster_cpu(p, prev_cpu);if (new_cpu < nr_cpu_ids)return new_cpu;}return new_cpu;
}|→static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{s64 this_load, load;s64 this_eff_load, prev_eff_load;int idx, this_cpu, prev_cpu;struct task_group *tg;unsigned long weight;int balanced;idx   = sd->wake_idx;this_cpu  = smp_processor_id();prev_cpu  = task_cpu(p);load      = source_load(prev_cpu, idx);this_load = target_load(this_cpu, idx);/* (5.1) *//** If sync wakeup then subtract the (maximum possible)* effect of the currently running task from the load* of the current CPU:*/if (sync) {tg = task_group(current);weight = current->se.avg.load_avg;this_load += effective_load(tg, this_cpu, -weight, -weight);load += effective_load(tg, prev_cpu, 0, -weight);}tg = task_group(p);weight = p->se.avg.load_avg;/** In low-load situations, where prev_cpu is idle and this_cpu is idle* due to the sync cause above having dropped this_load to 0, we'll* always have an imbalance, but there's really nothing you can do* about that, so that's good too.** Otherwise check if either cpus are near enough in load to allow this* task to be woken on this_cpu.*/this_eff_load = 100;this_eff_load *= capacity_of(prev_cpu);prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;prev_eff_load *= capacity_of(this_cpu);if (this_load > 0) {this_eff_load *= this_load +effective_load(tg, this_cpu, weight, weight);prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);}balanced = this_eff_load <= prev_eff_load;schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);if (!balanced)return 0;schedstat_inc(sd, ttwu_move_affine);schedstat_inc(p, se.statistics.nr_wakeups_affine);return 1;
}|→static int energy_aware_wake_cpu(struct task_struct *p, int target)
{int target_max_cap = INT_MAX;int target_cpu = task_cpu(p);unsigned long min_util;unsigned long new_util;int i, cpu;bool is_tiny = false;int nrg_diff = 0;int cluster_id = 0;struct cpumask cluster_cpus;int max_cap_cpu = 0;int best_cpu = 0;/* (6.1.1) 遍历cluster和cpu,找出一个capacity最小的cpu能容纳下util(p)为best_cpu *//** Find group with sufficient capacity. We only get here if no cpu is* overutilized. We may end up overutilizing a cpu by adding the task,* but that should not be any worse than select_idle_sibling().* load_balance() should sort it out later as we get above the tipping* point.*/cluster_id = arch_get_nr_clusters();for (i = 0; i < cluster_id; i++) {arch_get_cluster_cpus(&cluster_cpus, i);max_cap_cpu = cpumask_first(&cluster_cpus);/* Assuming all cpus are the same in group */for_each_cpu(cpu, &cluster_cpus) {if (!cpu_online(cpu))continue;if (capacity_of(max_cap_cpu) < target_max_cap &&task_fits_max(p, max_cap_cpu)) {best_cpu = cpu;target_max_cap = capacity_of(max_cap_cpu);}break;}}if (task_util(p) < TINY_TASK_THRESHOLD)is_tiny = true;/* Find cpu with sufficient capacity */min_util = boosted_task_util(p);if (!is_tiny)/* (6.1.2) 根据best_cpu所在的cluster和进程p的affinity,找出加上util(p)以后,剩余capacity最大的cpu:target_cpu*/target_cpu = select_max_spare_capacity_cpu(p, best_cpu);else/* (6.1.3) 根据cluster和进程p的affinity,找出加上util(p)以后,当前freq的capacity能满足的第一个cpu:target_cpu*/for_each_cpu_and(i, tsk_cpus_allowed(p), &cluster_cpus) {if (!cpu_online(i))continue;/** p's blocked utilization is still accounted for on prev_cpu* so prev_cpu will receive a negative bias due to the double* accounting. However, the blocked utilization may be zero.*/new_util = cpu_util(i) + task_util(p);/** Ensure minimum capacity to grant the required boost.* The target CPU can be already at a capacity level higher* than the one required to boost the task.*/new_util = max(min_util, new_util);#ifdef CONFIG_MTK_SCHED_INTEROPif (cpu_rq(i)->rt.rt_nr_running && likely(!is_rt_throttle(i)))continue;
#endifif (new_util > capacity_orig_of(i))continue;if (new_util < capacity_curr_of(i)) {target_cpu = i;if (cpu_rq(i)->nr_running)break;}/* cpu has capacity at higher OPP, keep it as fallback */if (target_cpu == task_cpu(p))target_cpu = i;}/* (6.1.4) 如果pre_cpu和target_cpu是同一个cluster,直接成功返回 *//* no need energy calculation if the same domain */if (is_the_same_domain(task_cpu(p), target_cpu))return target_cpu;/* no energy comparison if the same cluster */if (target_cpu != task_cpu(p)) {/* (6.1.5) 构造需要迁移的环境变量  */struct energy_env eenv = {.util_delta = task_util(p),.src_cpu    = task_cpu(p),.dst_cpu    = target_cpu,.task       = p,};/* Not enough spare capacity on previous cpu */if (cpu_overutilized(task_cpu(p))) {trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu,(int)task_util(p), nrg_diff, true, is_tiny);return target_cpu;}/* (6.1.6) 计算进程p从pre_cpu迁移到target_cpu后的功耗差值nrg_diff,如果功耗增加,nrg_diff >= 0,返回pre_cpu即task_cpu(p),如果功耗减少,返回新的target_cpu*/nrg_diff = energy_diff(&eenv);if (nrg_diff >= 0) {trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu,(int)task_util(p), nrg_diff, false, is_tiny);return task_cpu(p);}}trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, false, is_tiny);return target_cpu;
}||→static inline int
energy_diff(struct energy_env *eenv)
{unsigned int boost;int nrg_delta;/* Conpute "absolute" energy diff */__energy_diff(eenv);/* Return energy diff when boost margin is 0 */
#ifdef CONFIG_CGROUP_SCHEDTUNEboost = schedtune_task_boost(eenv->task);
#elseboost = get_sysctl_sched_cfs_boost();
#endifif (boost == 0)return eenv->nrg.diff;/* Compute normalized energy diff */nrg_delta = normalize_energy(eenv->nrg.diff);eenv->nrg.delta = nrg_delta;eenv->payoff = schedtune_accept_deltas(eenv->nrg.delta,eenv->cap.delta,eenv->task);/** When SchedTune is enabled, the energy_diff() function will return* the computed energy payoff value. Since the energy_diff() return* value is expected to be negative by its callers, this evaluation* function return a negative value each time the evaluation return a* positive payoff, which is the condition for the acceptance of* a scheduling decision*/return -eenv->payoff;
}static int __energy_diff(struct energy_env *eenv)
{struct sched_domain *sd;struct sched_group *sg;int sd_cpu = -1, energy_before = 0, energy_after = 0;/* (6.1.6.1) 构造迁移前的环境变量  */struct energy_env eenv_before = {.util_delta = 0,.src_cpu    = eenv->src_cpu,.dst_cpu    = eenv->dst_cpu,.nrg        = { 0, 0, 0, 0},.cap        = { 0, 0, 0 },};
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORTint i;
#endifif (eenv->src_cpu == eenv->dst_cpu)return 0;#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT/* To get max opp index of every cluster for power estimation of share buck */for (i = 0; i < arch_get_nr_clusters(); i++) {/* for energy before */eenv_before.opp_idx[i]  = mtk_cluster_capacity_idx(i, &eenv_before);/* for energy after */eenv->opp_idx[i]  = mtk_cluster_capacity_idx(i, eenv);mt_sched_printf(sched_eas_energy_calc, "cid=%d, before max_opp:%d, after max_opp:%d\n",i, eenv_before.opp_idx[i], eenv->opp_idx[i]);}
#endif/* (6.1.6.2) sd来至于cache sd_ea,是cpu对应的顶层sd(tl DIE层) */sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));if (!sd)return 0; /* Error */mt_sched_printf(sched_eas_energy_calc, "0. %s: move task from src=%d to dst=%d util=%d",__func__, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta);sg = sd->groups;/* (6.1.6.3) 遍历sg所在sg链表,找到符合条件的sg, 累加计算eenv_before、eenv相关sg的功耗*/ do {/* (6.1.6.4) 如果当前sg包含src_cpu或者dst_cpu,计算 */if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {/* (6.1.6.5) 当前顶层sg为eenv的sg_top  */eenv_before.sg_top = eenv->sg_top = sg;mt_sched_printf(sched_eas_energy_calc, "1. %s: src=%d dst=%d mask=0x%lx (before)",__func__,  eenv_before.src_cpu, eenv_before.dst_cpu, sg->cpumask[0]);/* (6.1.6.6) 计算eenv_before负载下sg的power */if (sched_group_energy(&eenv_before))return 0; /* Invalid result abort */energy_before += eenv_before.energy;/* Keep track of SRC cpu (before) capacity */eenv->cap.before = eenv_before.cap.before;eenv->cap.delta = eenv_before.cap.delta;mt_sched_printf(sched_eas_energy_calc, "2. %s: src=%d dst=%d mask=0x%lx (after)",__func__,  eenv->src_cpu, eenv->dst_cpu, sg->cpumask[0]);/* (6.1.6.7) 计算eenv负载下sg的power */if (sched_group_energy(eenv))return 0; /* Invalid result abort */energy_after += eenv->energy;}} while (sg = sg->next, sg != sd->groups);/* (6.1.6.8) 计算energy_after - energy_before */eenv->nrg.before = energy_before;eenv->nrg.after = energy_after;eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;eenv->payoff = 0;trace_sched_energy_diff(eenv->task,eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,eenv->cap.before, eenv->cap.after, eenv->cap.delta,eenv->nrg.delta, eenv->payoff);mt_sched_printf(sched_eas_energy_calc, "5. %s: nrg.diff=%d cap.delta=%d",__func__, eenv->nrg.diff, eenv->cap.delta);return eenv->nrg.diff;
}|||→static int sched_group_energy(struct energy_env *eenv)
{struct sched_domain *sd;int cpu, total_energy = 0;struct cpumask visit_cpus;struct sched_group *sg;
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORTint only_lv1_sd = 0;
#endifWARN_ON(!eenv->sg_top->sge);cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));/* (6.1.6.6.1) 根据sg_top顶层sd,找到需要计算的cpu集合visit_cpus,逐个遍历其中每一个cpuooooo这一套复杂的循环算法计算下来,其实就计算了几个power,以cpu0-cpu3为例:4个底层sg的power + 1个顶层sg的power*/ while (!cpumask_empty(&visit_cpus)) {struct sched_group *sg_shared_cap = NULL;/* (6.1.6.6.2) 选取visit_cpus中的第一个cpu */cpu = cpumask_first(&visit_cpus);sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);if (!sd) {/* a corner racing with hotplug? sd doesn't exist in this cpu. */return -EINVAL;}/** Is the group utilization affected by cpus outside this* sched_group?*/sd = rcu_dereference(per_cpu(sd_scs, cpu));
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT/* Try to handle one CPU in this cluster by hotplug.* In it there is only lv-1 sched_domain exist which having* no share_cap_states.*/if (!sd) {sd = rcu_dereference(per_cpu(sd_ea, cpu));only_lv1_sd = 1;}
#endifif (!sd) {/** We most probably raced with hotplug; returning a* wrong energy estimation is better than entering an* infinite loop.*/return -EINVAL;}if (sd->parent)sg_shared_cap = sd->parent->groups;/* (6.1.6.6.3) 从底层到顶层逐个遍历cpu所在的sd */for_each_domain(cpu, sd) {sg = sd->groups;/* (6.1.6.6.4) 如果是顶层sd,只会计算一个sg *//* Has this sched_domain already been visited? */if (sd->child && group_first_cpu(sg) != cpu)break;/* (6.1.6.6.5) 逐个遍历该层次sg链表所在sg */do {unsigned long group_util;int sg_busy_energy, sg_idle_energy;int cap_idx, idle_idx;if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)eenv->sg_cap = sg_shared_cap;elseeenv->sg_cap = sg;/* (6.1.6.6.6) 根据eenv指示的负载变化,找出满足该sg中最大负载cpu的capacity_index */cap_idx = find_new_capacity(eenv, sg->sge);if (sg->group_weight == 1) {/* Remove capacity of src CPU (before task move) */if (eenv->util_delta == 0 &&cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {eenv->cap.before = sg->sge->cap_states[cap_idx].cap;eenv->cap.delta -= eenv->cap.before;}/* Add capacity of dst CPU  (after task move) */if (eenv->util_delta != 0 &&cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {eenv->cap.after = sg->sge->cap_states[cap_idx].cap;eenv->cap.delta += eenv->cap.after;}}/* (6.1.6.6.7) 找出sg所有cpu中最小的idle index */idle_idx = group_idle_state(sg);/* (6.1.6.6.8) 累加sg中所有cpu的相对负载,最大负载为sg->sge->cap_states[eenv->cap_idx].cap*/group_util = group_norm_util(eenv, sg);/* (6.1.6.6.9) 计算power = busy_power + idle_power */
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT/** To support power estimation for MTK soc.* Consider share buck for dynamic power and SPARK/MCDI for static power.*/sg_busy_energy = (group_util *sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0))>> SCHED_CAPACITY_SHIFT;sg_idle_energy = ((SCHED_LOAD_SCALE - group_util) *sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0))>> SCHED_CAPACITY_SHIFT;
#else/* Power value had been separated to static + dynamic here */sg_busy_energy = (group_util * (sg->sge->cap_states[cap_idx].dyn_pwr +sg->sge->cap_states[cap_idx].lkg_pwr[sg->sge->lkg_idx]))>> SCHED_CAPACITY_SHIFT;sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) *sg->sge->idle_states[idle_idx].power)>> SCHED_CAPACITY_SHIFT;
#endiftotal_energy += sg_busy_energy + sg_idle_energy;mt_sched_printf(sched_eas_energy_calc, "busy_energy=%d idle_eneryg=%d (cost=%d)",sg_busy_energy, sg_idle_energy, total_energy);/* (6.1.6.6.10) 如果遍历了底层sd,从visit_cpus中去掉对应的sg cpu */if (!sd->child)cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT/** We try to get correct energy estimation while racing with hotplug* and avoid entering a infinite loop.*/if (only_lv1_sd) {eenv->energy = total_energy;return 0;}
#endifif (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))goto next_cpu;} while (sg = sg->next, sg != sd->groups);}/* (6.1.6.6.11) 如果遍历了cpu的底层到顶层sd,从visit_cpus中去掉对应的cpu */
next_cpu:cpumask_clear_cpu(cpu, &visit_cpus);continue;}eenv->energy = total_energy;return 0;
}|→static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,int this_cpu, int sd_flag)
{struct sched_group *idlest = NULL, *group = sd->groups;struct sched_group *fit_group = NULL;unsigned long min_load = ULONG_MAX, this_load = 0;unsigned long fit_capacity = ULONG_MAX;int load_idx = sd->forkexec_idx;int imbalance = 100 + (sd->imbalance_pct-100)/2;/* (7.2.1) 选择load_idx */if (sd_flag & SD_BALANCE_WAKE)load_idx = sd->wake_idx;/* (7.2.2) 当前cpu所在sd层次的sg,遍历sg所在的sg链表,选出负载最轻的idlest sg */do {unsigned long load, avg_load;int local_group;int i;/* (7.2.3) 略过不符合p进程affinity的sg *//* Skip over this group if it has no CPUs allowed */if (!cpumask_intersects(sched_group_cpus(group),tsk_cpus_allowed(p)))continue;/* (7.2.4) local_group等于本cpu所在的sg */local_group = cpumask_test_cpu(this_cpu,sched_group_cpus(group));/* Tally up the load of all CPUs in the group */avg_load = 0;/* (7.2.5) 遍历sg中的所有cpu,累加负载 */for_each_cpu(i, sched_group_cpus(group)) {/* Bias balancing toward cpus of our domain */if (local_group)load = source_load(i, load_idx);elseload = target_load(i, load_idx);#ifdef CONFIG_MTK_SCHED_INTEROPload += mt_rt_load(i);
#endifavg_load += load;/* (7.2.6) 如果EAS使能,找到能最小满足进程p的capacity sg *//** Look for most energy-efficient group that can fit* that can fit the task.*/if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {fit_capacity = capacity_of(i);fit_group = group;}}/* (7.2.7) 用累计的负载计算相对负载 *//* Adjust by relative CPU capacity of the group */avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;/* (7.2.8) 计算idlest sg */if (local_group) {this_load = avg_load;} else if (avg_load < min_load) {min_load = avg_load;idlest = group;}} while (group = group->next, group != sd->groups);/* (7.2.9) EAS使能,返回fit_group */if (energy_aware() && fit_group)return fit_group;if (!idlest || 100*this_load < imbalance*min_load)return NULL;/* (7.2.11) 否则,返回idlest */return idlest;
}|→static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{unsigned long load, min_load = ULONG_MAX;unsigned int min_exit_latency = UINT_MAX;u64 latest_idle_timestamp = 0;int least_loaded_cpu = this_cpu;int shallowest_idle_cpu = -1;int i;/* (7.3.1) 遍历sg中符合p进程affinity的cpu *//* Traverse only the allowed CPUs */for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {/* (7.3.2) 如果cpu的剩余capacity能容纳下p进程的load */if (task_fits_spare(p, i)) {struct rq *rq = cpu_rq(i);struct cpuidle_state *idle = idle_get_state(rq);/* (7.3.2.1) 优先选出idle状态,且退出idle开销最小的cpu */if (idle && idle->exit_latency < min_exit_latency) {/** We give priority to a CPU whose idle state* has the smallest exit latency irrespective* of any idle timestamp.*/min_exit_latency = idle->exit_latency;latest_idle_timestamp = rq->idle_stamp;shallowest_idle_cpu = i;} else if (idle_cpu(i) &&(!idle || idle->exit_latency == min_exit_latency) &&rq->idle_stamp > latest_idle_timestamp) {/** If equal or no active idle state, then* the most recently idled CPU might have* a warmer cache.*/latest_idle_timestamp = rq->idle_stamp;shallowest_idle_cpu = i;} else if (shallowest_idle_cpu == -1) {/** If we haven't found an idle CPU yet* pick a non-idle one that can fit the task as* fallback.*/shallowest_idle_cpu = i;}/* (7.3.3) cpu的剩余capacity容纳不下进程p,选出负载最轻的cpu */} else if (shallowest_idle_cpu == -1) {load = weighted_cpuload(i);
#ifdef CONFIG_MTK_SCHED_INTEROPload += mt_rt_load(i);
#endifif (load < min_load || (load == min_load && i == this_cpu)) {min_load = load;least_loaded_cpu = i;}}}return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}|→static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,int prev_cpu, int new_cpu)
{struct list_head *pos;struct sched_entity *se = &p->se;struct cpumask fast_cpu_mask, slow_cpu_mask;#ifdef CONFIG_HMP_TRACERint cpu = 0;for_each_online_cpu(cpu)trace_sched_cfs_runnable_load(cpu, cfs_load(cpu), cfs_length(cpu));
#endif/* error handling */if (prev_cpu >= num_possible_cpus())return new_cpu;/** Skip all the checks if only one CPU is online.* Otherwise, select the most appropriate CPU from cluster.*/if (num_online_cpus() == 1)goto out;/* (8.1) 找出fastest hmp_domain,只有一个, 找出slow hmp_domain,有多个,在一个fast_cpu_mask和多个slow_cpu_mask之间,逐个尝试hmp_select_task_migration()p进程是否会满足hmp迁移*/cpumask_clear(&fast_cpu_mask);cpumask_clear(&slow_cpu_mask);/* order: fast to slow hmp domain */list_for_each(pos, &hmp_domains) {struct hmp_domain *domain = list_entry(pos, struct hmp_domain, hmp_domains);if (!cpumask_empty(&domain->cpus)) {if (cpumask_empty(&fast_cpu_mask)) {cpumask_copy(&fast_cpu_mask, &domain->possible_cpus);} else {cpumask_copy(&slow_cpu_mask, &domain->possible_cpus);new_cpu = hmp_select_task_migration(sd_flag, p,prev_cpu, new_cpu, &fast_cpu_mask, &slow_cpu_mask);}}}out:/* it happens when num_online_cpus=1 */if (new_cpu >= nr_cpu_ids) {/* BUG_ON(1); */new_cpu = prev_cpu;}cfs_nr_pending(new_cpu)++;cfs_pending_load(new_cpu) += se_load(se);return new_cpu;}||→static int hmp_select_task_migration(int sd_flag, struct task_struct *p, int prev_cpu, int new_cpu,struct cpumask *fast_cpu_mask, struct cpumask *slow_cpu_mask)
{int step = 0;struct sched_entity *se = &p->se;int B_target = num_possible_cpus();int L_target = num_possible_cpus();struct clb_env clbenv;/* (8.1.1) 找出fast_cpu_mask中负载最轻的cpu B_target,且符合p进程的affinity */B_target = hmp_select_cpu(HMP_SELECT_RQ, p, fast_cpu_mask, prev_cpu, 0);/* (8.1.2) 找出slow_cpu_mask中负载最轻的cpu L_target,且符合p进程的affinity */L_target = hmp_select_cpu(HMP_SELECT_RQ, p, slow_cpu_mask, prev_cpu, 1);/** Only one cluster exists or only one cluster is allowed for this task* Case 1: return the runqueue whose load is minimum* Case 2: return original CFS runqueue selection result*/if (B_target >= num_possible_cpus() && L_target >= num_possible_cpus())goto out;if (B_target >= num_possible_cpus())goto select_slow;if (L_target >= num_possible_cpus())goto select_fast;/** Two clusters exist and both clusters are allowed for this task* Step 1: Move newly created task to the cpu where no tasks are running* Step 2: Migrate heavy-load task to big* Step 3: Migrate light-load task to LITTLE* Step 4: Make sure the task stays in its previous hmp domain*/step = 1;if (task_created(sd_flag) && !task_low_priority(p->prio)) {if (!rq_length(B_target))goto select_fast;if (!rq_length(L_target))goto select_slow;}/* (8.1.3) 计算如果L_target和B_target发生hmp迁移,各种负载和thershold的计算 */memset(&clbenv, 0, sizeof(clbenv));clbenv.flags |= HMP_SELECT_RQ;cpumask_copy(&clbenv.lcpus, slow_cpu_mask);cpumask_copy(&clbenv.bcpus, fast_cpu_mask);clbenv.ltarget = L_target;clbenv.btarget = B_target;sched_update_clbstats(&clbenv);/* (8.1.4) 判断进程p从L_target up到 B_target的可行性 */step = 2;if (hmp_up_migration(L_target, &B_target, se, &clbenv))goto select_fast;/* (8.1.5) 判断进程p从B_target down到 L_target的可行性 */step = 3;if (hmp_down_migration(B_target, &L_target, se, &clbenv))goto select_slow;/* (8.1.6) 如果prev_cpu是slowest */step = 4;if (hmp_cpu_is_slowest(prev_cpu))goto select_slow;goto select_fast;/* (8.1.7) 返回 B_target */
select_fast:new_cpu = B_target;cpumask_clear(slow_cpu_mask);goto out;/* (8.1.8) 返回 L_target */
select_slow:new_cpu = L_target;cpumask_copy(fast_cpu_mask, slow_cpu_mask);cpumask_clear(slow_cpu_mask);goto out;out:
#ifdef CONFIG_HMP_TRACERtrace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
#endifreturn new_cpu;
}

4.2、HMP负载均衡

除了SMP load_balance()负载均衡以外,我们还希望在多个SMP cluster之间遵守一种规则:heavy任务跑在big core上,light任务跑在little core上,这样能更快地达到合理的负载分布。这种算法就叫做HMP负载均衡。EAS会统一考虑负载、性能和功耗,所以EAS使能后HMP就被禁用了。

HMP负载均衡的操作分两种:

  • 1、heavy task从little cpu迁移到big cpu。这种叫做up操作,对应的函数hmp_force_up_migration();
  • 2、light task从big cpu迁移到little cpu。这种叫做down操作,对应的函数hmp_force_down_migration();
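上面两种操作最核心的数值判断规则是:进程负载大于B族的up门限则up迁移,小于等于L族的down门限则down迁移,介于两者之间则留在原cluster。下面是一个极简的用户态示意代码(非内核实现,门限数值为假设值),只演示这一条规则;实际代码中还要叠加stable、优先级、capacity等过滤条件,见后文hmp_up_migration()/hmp_down_migration()的分析:

```c
#include <stdio.h>

/* 示意用的简化结构,字段名仿照clb_stats,数值为假设值 */
struct clb_stats_sketch {
	int threshold;	/* 迁移门限 */
};

/* 返回值:1=up迁移到big,-1=down迁移到little,0=保持不动 */
static int hmp_migrate_decision(int se_load,
				const struct clb_stats_sketch *B,
				const struct clb_stats_sketch *L)
{
	if (se_load > B->threshold)
		return 1;	/* heavy task:up迁移 */
	if (se_load <= L->threshold)
		return -1;	/* light task:down迁移 */
	return 0;		/* 负载居中:留在原cluster */
}

int main(void)
{
	/* 假设的门限:up门限700,down门限300(实际由adj_threshold()动态计算) */
	struct clb_stats_sketch B = { .threshold = 700 };
	struct clb_stats_sketch L = { .threshold = 300 };
	int loads[] = { 100, 500, 900 };

	for (int i = 0; i < 3; i++)
		printf("se_load=%d -> decision=%d\n",
		       loads[i], hmp_migrate_decision(loads[i], &B, &L));
	return 0;
}
```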

4.2.1、hmp domain初始化

hmp在初始化的时候会为每个cluster分配一个hmp_domain,并把所有hmp_domain加入到全局链表hmp_domains中。hmp_domains链表构建完成以后,离表头hmp_domains最近的hmp_domain对应速度最快的cluster,离表头越远的hmp_domain对应的cluster速度越慢。因为在构造链表时是按照cluster id依次头插(list_add())加入的,速度最快的cluster的hmp_domain最后加入,所以离表头最近。
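这个"最后加入的反而在表头"的效果可以用下面的用户态示意代码说明(示意代码,非内核list_head实现;这里假设cluster id越大速度越快):

```c
#include <stdio.h>
#include <stdlib.h>

/* 示意:每个节点代表一个cluster的hmp_domain */
struct node {
	int cluster_id;
	struct node *next;
};

/* 头插法,效果等价于内核的list_add():新节点总是成为新的表头 */
static void add_head(struct node **head, int cluster_id)
{
	struct node *n = malloc(sizeof(*n));

	n->cluster_id = cluster_id;
	n->next = *head;
	*head = n;
}

int main(void)
{
	struct node *head = NULL;

	/* 按cluster id 0..2依次加入 */
	for (int id = 0; id < 3; id++)
		add_head(&head, id);

	/* 打印顺序为:cluster2 cluster1 cluster0,即最后加入的(最快的)离表头最近 */
	for (struct node *p = head; p; p = p->next)
		printf("cluster%d ", p->cluster_id);
	printf("\n");

	while (head) {
		struct node *next = head->next;
		free(head);
		head = next;
	}
	return 0;
}
```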

static int __init hmp_cpu_mask_setup(void)
{struct hmp_domain *domain;struct list_head *pos;int dc, cpu;pr_warn("Initializing HMP scheduler:\n");/* Initialize hmp_domains using platform code *//* (1) 调用arch相关的hmp_domains初始化函数 */arch_get_hmp_domains(&hmp_domains);if (list_empty(&hmp_domains)) {pr_warn("HMP domain list is empty!\n");return 0;}/* Print hmp_domains */dc = 0;list_for_each(pos, &hmp_domains) {domain = list_entry(pos, struct hmp_domain, hmp_domains);for_each_cpu(cpu, &domain->possible_cpus) {/* (2) 给per_cpu变量hmp_cpu_domain赋值 */per_cpu(hmp_cpu_domain, cpu) = domain;}dc++;}return 1;
}|→void __init arch_get_hmp_domains(struct list_head *hmp_domains_list)
{struct hmp_domain *domain;struct cpumask cpu_mask;int id, maxid;cpumask_clear(&cpu_mask);maxid = arch_get_nr_clusters();/** Initialize hmp_domains* Must be ordered with respect to compute capacity.* Fastest domain at head of list.*//* (1.1) 按照cluster id初始化对应的hmp_domain */for (id = 0; id < maxid; id++) {arch_get_cluster_cpus(&cpu_mask, id);domain = (struct hmp_domain *)kmalloc(sizeof(struct hmp_domain), GFP_KERNEL);cpumask_copy(&domain->possible_cpus, &cpu_mask);cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus);/* (1.2) 将hmp_domain加入到全局链表hmp_domains_list即hmp_domains中 */list_add(&domain->hmp_domains, hmp_domains_list);}
}

4.2.2、hmp_force_up_migration()

hmp_force_up_migration()的操作主要有以下几个步骤:

需要重点提一下的是:负载跟踪同时维护了3种负载(load_avg、loadwop_avg、util_avg),HMP负载均衡主要使用其中的loadwop_avg(不带weight的runnable负载)。

  • 1、根据当前cpu,选择fast_cpu_mask、slow_cpu_mask;

hmp_force_up_migration尝试把slow cpu上的heavy进程迁移到fast cpu上,关于slow、fast的选择有以下几种场景:

  • 2、选择当前cpu的heaviest进程作为迁移进程p;并不会遍历cpu上所有进程去选出heaviest进程,只会查询curr进程和cfs_rq中5个进程中的heaviest;

  • 3、根据fast_cpu_mask,选择一个负载最少的target cpu;

  • 4、根据源cpu(curr_cpu)、目的cpu(target_cpu),计算负载;

重要的数据计算方法:

| 重要数据 | 所属结构 | 含义 | 更新/获取函数 | 计算方法 |
| --- | --- | --- | --- | --- |
| clbenv->bstats.cpu_power | clbenv->bstats | B族cpu的绝对计算能力 | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->btarget) |
| clbenv->lstats.cpu_power | clbenv->lstats | L族cpu的绝对计算能力 | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->ltarget) |
| clbenv->bstats.cpu_capacity | clbenv->bstats | B族cpu的相对计算能力,大于1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1) |
| clbenv->lstats.cpu_capacity | clbenv->lstats | L族cpu的相对计算能力,等于1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE |
| clbs->ncpu | clbenv->bstats/clbenv->lstats | L族/B族online的cpu数量 | collect_cluster_stats() | if (cpu_online(cpu)) clbs->ncpu++; |
| clbs->ntask | clbenv->bstats/clbenv->lstats | L族/B族所有online cpu中所有层级se的总和 | collect_cluster_stats() | clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running; |
| clbs->load_avg | clbenv->bstats/clbenv->lstats | L族/B族online cpu的平均runnable负载,不带weight | collect_cluster_stats() | sum(cpu_rq(cpu)->cfs.avg.loadwop_avg)/clbs->ncpu |
| clbs->scaled_acap | clbenv->bstats/clbenv->lstats | L族/B族target cpu计算能力的剩余值 | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg) |
| clbs->scaled_atask | clbenv->bstats/clbenv->lstats | L族/B族target cpu的task space的剩余值 | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg) |
| clbenv->bstats.threshold | clbenv->bstats | 进程要up迁移到B族的负载门限值 | adj_threshold() | HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);其中b_nacap、b_natask会乘以一个放大系数(b_cpu_power/l_cpu_power),类似于cpu_capacity的计算 |
| clbenv->lstats.threshold | clbenv->lstats | 进程要down迁移到L族的负载门限值 | adj_threshold() | HMP_MAX_LOAD * l_nacap * l_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);其中b_nacap、b_natask会乘以一个放大系数(b_cpu_power/l_cpu_power),类似于cpu_capacity的计算 |

(注:按sched_update_clbstats()的代码,cpu_capacity中等于1024的是lstats、大于1024的是bstats,上表已按代码修正。)
  • 5、根据计算的负载情况,判断进程p是否符合up迁移条件((se_load(se) > B->threshold),等其他条件);
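结合adj_threshold()的公式,下面给出一段可运行的示意计算(用户态代码,HMP_MAX_LOAD的取值以及各项输入均为假设值),用来体会门限随两族剩余能力变化的趋势:B族越空闲,up门限越低(越容易up迁移);L族越空闲,down门限越高(越容易down迁移):

```c
#include <stdio.h>

#define HMP_MAX_LOAD 1023	/* 假设值,实际取值以内核源码为准 */
#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))

/* 按照adj_threshold()的公式计算up/down门限(示意) */
static void calc_thresholds(int b_acap, int b_atask, int l_acap, int l_atask,
			    int b_power, int l_power,
			    int *up_threshold, int *down_threshold)
{
	/* B族的剩余量乘以放大系数(b_power/l_power),换算到L族的尺度上 */
	int b_nacap  = POSITIVE(b_acap  * b_power / (l_power + 1));
	int b_natask = POSITIVE(b_atask * b_power / (l_power + 1));
	int l_nacap  = POSITIVE(l_acap);
	int l_natask = POSITIVE(l_atask);
	int denom = (b_nacap + l_nacap) * (b_natask + l_natask) + 1;

	*up_threshold   = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / denom;
	*down_threshold = HMP_MAX_LOAD * l_nacap * l_natask / denom;
}

int main(void)
{
	int up, down;

	/* 假设:B族很空闲、L族较忙 -> up门限很低,down门限也很低 */
	calc_thresholds(60, 60, 10, 10, 1024, 600, &up, &down);
	printf("B idle : up=%d down=%d\n", up, down);

	/* 假设:B族较忙、L族很空闲 -> up门限变高,down门限变高 */
	calc_thresholds(10, 10, 60, 60, 1024, 600, &up, &down);
	printf("B busy : up=%d down=%d\n", up, down);
	return 0;
}
```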

up-migration条件列表(hmp_up_migration()):

| 条件 | 含义 | 计算方法 | 计算解析 |
| --- | --- | --- | --- |
| [1] Migration stabilizing | 如果target cpu刚做过up迁移,不适合再进行迁移 | if (!hmp_up_stable(*target_cpu)) check->result = 0; | (((now - hmp_last_up_migration(cpu)) >> 10) |
static void run_rebalance_domains(struct softirq_action *h)
{struct rq *this_rq = this_rq();enum cpu_idle_type idle = this_rq->idle_balance ?CPU_IDLE : CPU_NOT_IDLE;int this_cpu = smp_processor_id();/* bypass load balance of HMP if EAS consideration *//* (1) 在EAS不使能的情况下,尝试进行HMP负载均衡 */if ((!energy_aware() && sched_feat(SCHED_HMP)) ||(hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))hmp_force_up_migration(this_cpu);/** If this cpu has a pending nohz_balance_kick, then do the* balancing on behalf of the other idle cpus whose ticks are* stopped. Do nohz_idle_balance *before* rebalance_domains to* give the idle cpus a chance to load balance. Else we may* load balance only within the local sched_domain hierarchy* and abort nohz_idle_balance altogether if we pull some load.*/nohz_idle_balance(this_rq, idle);rebalance_domains(this_rq, idle);
}|→static void hmp_force_up_migration(int this_cpu)
{int curr_cpu, target_cpu;struct sched_entity *se;struct rq *target;unsigned long flags;unsigned int force = 0;struct task_struct *p;struct clb_env clbenv;
#ifdef CONFIG_SCHED_HMP_PLUSstruct sched_entity *orig;
#endifif (!spin_trylock(&hmp_force_migration))return;#ifdef CONFIG_HMP_TRACERfor_each_online_cpu(curr_cpu)trace_sched_cfs_runnable_load(curr_cpu, cfs_load(curr_cpu), cfs_length(curr_cpu));
#endif/* Migrate heavy task from LITTLE to big *//* (1.1) 逐个online cpu尝试进行heavy task从little cpu到big cpu的迁移 */for_each_online_cpu(curr_cpu) {struct hmp_domain *hmp_domain = NULL;struct cpumask fast_cpu_mask, slow_cpu_mask;cpumask_clear(&fast_cpu_mask);cpumask_clear(&slow_cpu_mask);/* (1.2) 如果当前cpu不属于速度最快(fastest)的domain,则尝试进行up操作*/if (!hmp_cpu_is_fastest(curr_cpu)) {/* current cpu is slow_cpu_mask*//* (1.2.1) 当前cpu所在的hmp_domain为slow_cpu_mask */hmp_domain = hmp_cpu_domain(curr_cpu);cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus);/* (1.2.2) 最fastest且online的hmp_domain为fast_cpu_mask */while (&hmp_domain->hmp_domains != hmp_domains.next) {struct list_head *pos = &hmp_domain->hmp_domains;hmp_domain = list_entry(pos->prev, struct hmp_domain, hmp_domains);if (!cpumask_empty(&hmp_domain->cpus)) {cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus);break;}}} else {/* (1.3) 如果当前cpu属于速度最快(fastest)的domain,则直接进行down操作*/hmp_force_down_migration(this_cpu);continue;}if (!hmp_domain || hmp_domain == hmp_cpu_domain(curr_cpu))continue;if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask))continue;force = 0;/* (1.4) 取出当前cpu的当前cfs进程 */target = cpu_rq(curr_cpu);raw_spin_lock_irqsave(&target->lock, flags);se = target->cfs.curr;if (!se) {raw_spin_unlock_irqrestore(&target->lock, flags);continue;}/* Find task entity */if (!entity_is_task(se)) {struct cfs_rq *cfs_rq;cfs_rq = group_cfs_rq(se);while (cfs_rq) {se = cfs_rq->curr;cfs_rq = group_cfs_rq(se);}}
#ifdef CONFIG_SCHED_HMP_PLUSorig = se;/* (1.5) 或者取出当前cpu前5个cfs进程中,负载最重(heaviest)的进程 */se = hmp_get_heaviest_task(se, -1);if (!se) {raw_spin_unlock_irqrestore(&target->lock, flags);continue;}if (!entity_is_task(se))p = task_of(orig);else
#endifp = task_of(se);/* (1.6) 选择fast_cpu_mask domain中,负载最少的cpu */target_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, -1, 0);if (target_cpu >= num_possible_cpus()) {raw_spin_unlock_irqrestore(&target->lock, flags);continue;}/* Collect cluster information *//* (1.7) up操作的对象已经选择好:源little cpu:curr_cpu目的big cpu:target_cpu*/memset(&clbenv, 0, sizeof(clbenv));clbenv.flags |= HMP_GB;clbenv.ltarget = curr_cpu;clbenv.btarget = target_cpu;cpumask_copy(&clbenv.lcpus, &slow_cpu_mask);cpumask_copy(&clbenv.bcpus, &fast_cpu_mask);/* (1.8) up操作前的数据计算 */sched_update_clbstats(&clbenv);/* Check migration threshold *//* (1.9) 根据计算的数据,判断up操作的可行性 */if (!target->active_balance &&hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv) &&!cpu_park(cpu_of(target))) {if (p->state != TASK_DEAD) {/* 准备从target rq中迁移进程p到target_cpu,设置rq正在处理负载balance标志active_balance */get_task_struct(p);target->active_balance = 1; /* force up */target->push_cpu = target_cpu;target->migrate_task = p;force = 1;trace_sched_hmp_migrate(p, target->push_cpu, 1);hmp_next_up_delay(&p->se, target->push_cpu);}}raw_spin_unlock_irqrestore(&target->lock, flags);/* (1.10) 判断结果是可以进行up操作,则调用hmp_force_up_cpu_stop()进行实际的up操作 */if (force) {if (stop_one_cpu_dispatch(cpu_of(target),hmp_force_up_cpu_stop,target, &target->active_balance_work)) {/* 迁移完成,清除标志 */put_task_struct(p); /* out of rq->lock */raw_spin_lock_irqsave(&target->lock, flags);target->active_balance = 0;force = 0;raw_spin_unlock_irqrestore(&target->lock, flags);}} else/* (1.11) 否则,再尝试进行down操作 */hmp_force_down_migration(this_cpu);}#ifdef CONFIG_HMP_TRACERtrace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
#endifspin_unlock(&hmp_force_migration);}||→static const int hmp_max_tasks = 5;
static struct sched_entity *hmp_get_heaviest_task(struct sched_entity *se, int target_cpu)
{int num_tasks = hmp_max_tasks;struct sched_entity *max_se = se;unsigned long int max_ratio = se->avg.loadwop_avg;const struct cpumask *hmp_target_mask = NULL;struct hmp_domain *hmp;/* (1.5.1) 如果本cpu是fastest cpu,则不用查找直接返回,因为本函数的目的是找little cpu中的heaviest进程*/if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))return max_se;/* (1.5.2) 获取比本cpu fater一级cpu的hmp_domain,作为进程亲和力判断的mask */hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));hmp_target_mask = &hmp->cpus;/* (1.5.3) 传入参数target_cpu = -1,所以hmp_target_mask使用的是源cpu hmp_domain的hmp->cpus */if (target_cpu >= 0) {/* idle_balance gets run on a CPU while* it is in the middle of being hotplugged* out. Bail early in that case.*/if (!cpumask_test_cpu(target_cpu, hmp_target_mask))return NULL;hmp_target_mask = cpumask_of(target_cpu);}/* The currently running task is not on the runqueue *//* (1.5.4) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出heaviest进程 比较使用的负载为se->avg.loadwop_avg,不带weight分量*/se = __pick_first_entity(cfs_rq_of(se));while (num_tasks && se) {if (entity_is_task(se) && se->avg.loadwop_avg > max_ratio &&cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se)))) {max_se = se;max_ratio = se->avg.loadwop_avg;}se = __pick_next_entity(se);num_tasks--;}return max_se;
}||→static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p,struct cpumask *mask, int prev, int up)
{int curr = 0;int target = num_possible_cpus();unsigned long curr_wload = 0;unsigned long target_wload = 0;struct cpumask srcp;/* (1.6.1) 综合fast_cpu_mask、cpu_online_mask、tsk_cpus_allowed(p),选取first cpu为target*/cpumask_and(&srcp, cpu_online_mask, mask);target = cpumask_any_and(&srcp, tsk_cpus_allowed(p));if (target >= num_possible_cpus())goto out;/** RT class is taken into account because CPU load is multiplied* by the total number of CPU runnable tasks that includes RT tasks.*//*  (1.6.2) 计算target cpu所对应的load,target_wload = (rq->cfs.avg.loadwop_avg + rq->cfs.avg.pending_load) * (rq->nr_running + rq->cfs.avg.nr_pending)该负载会受RT进程的影响,因为rq->nr_running会统计包括RT进程的数量*/target_wload = hmp_inc(cfs_load(target));target_wload += cfs_pending_load(target);target_wload *= rq_length(target);for_each_cpu(curr, mask) {/* Check CPU status and task affinity */if (!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p)))continue;/* For global load balancing, unstable CPU will be bypassed *//* (1.6.3) 如果当前是up操作,如果cpu在短时间内进行了down操作,则不适合马上进行up操作 */if (hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr, up))continue;curr_wload = hmp_inc(cfs_load(curr));curr_wload += cfs_pending_load(curr);curr_wload *= rq_length(curr);/* (1.6.4) 选择load最小的作为target cpu */if (curr_wload < target_wload) {target_wload = curr_wload;target = curr;/* (1.6.5) 在load同样小的情况下,选择prev cpu */} else if (curr_wload == target_wload && curr == prev) {target = curr;}}out:return target;
}||→static void sched_update_clbstats(struct clb_env *clbenv)
{/* init cpu power and capacity *//* (1.8.1) L族和B族的绝对运算能力和相对运算能力,.cpu_power = 绝对运算能力.cpu_capacity = 相对运算能力*/clbenv->bstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->btarget);clbenv->lstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->ltarget);clbenv->lstats.cpu_capacity = SCHED_CAPACITY_SCALE;clbenv->bstats.cpu_capacity = SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1);/* (1.8.2) 统计L族和B族的cluster信息:ncpu/ntask/load_avg/scaled_acap/scaled_atask */collect_cluster_stats(&clbenv->bstats, &clbenv->bcpus, clbenv->btarget);collect_cluster_stats(&clbenv->lstats, &clbenv->lcpus, clbenv->ltarget);/* (1.8.3) 计算L族和B族的up/down迁移门限threshold */adj_threshold(clbenv);
}|||→static void collect_cluster_stats(struct clb_stats *clbs, struct cpumask *cluster_cpus, int target)
{
#define HMP_RESOLUTION_SCALING (4)
#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)/* Update cluster informatics */int cpu;/* (1.8.2.1) 累加本族online cpu的值 */for_each_cpu(cpu, cluster_cpus) {if (cpu_online(cpu)) {clbs->ncpu++;clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running;clbs->load_avg += cpu_rq(cpu)->cfs.avg.loadwop_avg;
#ifdef CONFIG_SCHED_HMP_PRIO_FILTERclbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu);clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);
#endif}}if (!clbs->ncpu || target >= num_possible_cpus() || !cpumask_test_cpu(target, cluster_cpus))return;/** Calculate available CPU capacity* Calculate available task space** Why load ratio should be multiplied by the number of task ?* The task is the entity of scheduling unit so that we should consider* it in scheduler. Only considering task load is not enough.* Thus, multiplying the number of tasks can adjust load ratio to a more* reasonable value.*//* (1.8.2.2) 计算本族剩余的cpu计算能力 capacity = 相对计算能力(clbs->cpu_capacity) - 本cpu的负载(rq->cfs.avg.loadwop_avg):clbs->cpu_capacity是B族和L族相对的(L是1024,B大于1024),而负载(rq->cfs.avg.loadwop_avg)是相对自己的B族和L族的最大值都是1024*/clbs->load_avg /= clbs->ncpu;clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg;clbs->scaled_acap = hmp_scale_down(clbs->acap);/* (1.8.2.3) 计算本族剩余的task空间scaled_atask = 相对计算能力(clbs->cpu_capacity) - 本cpu的负载(rq->cfs.avg.loadwop_avg)*本cpu所有的进程数量(rq->cfs.h_nr_running)ooooo这里的计算也不是在同一纬度上的*/clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg;clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask;clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask);mt_sched_printf(sched_log, "[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n",__func__, target, *cpumask_bits(cluster_cpus),cpu_rq(target)->cfs.avg.loadwop_avg,cpu_rq(target)->cfs.h_nr_running,clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity,clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);
}|||→/** Task Dynamic Migration Threshold Adjustment.** If the workload between clusters is not balanced, adjust migration* threshold in an attempt to move task precisely.** Diff. = Max Threshold - Min Threshold** Dynamic UP-Threshold =*                               B_nacap               B_natask* Max Threshold - Diff. x  -----------------  x  -------------------*                          B_nacap + L_nacap     B_natask + L_natask*** Dynamic Down-Threshold =*                               L_nacap               L_natask* Min Threshold + Diff. x  -----------------  x  -------------------*                          B_nacap + L_nacap     B_natask + L_natask*/
static void adj_threshold(struct clb_env *clbenv)
{
#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))unsigned long b_cap = 0, l_cap = 0;int b_nacap, l_nacap, b_natask, l_natask;b_cap = clbenv->bstats.cpu_power;l_cap = clbenv->lstats.cpu_power;/* (1.8.3.1) 把B族剩余cpu计算能力和task空间,转换成L族的相对值 */b_nacap = POSITIVE(clbenv->bstats.scaled_acap *clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));b_natask = POSITIVE(clbenv->bstats.scaled_atask *clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));/* L族的值维持不变 */      l_nacap = POSITIVE(clbenv->lstats.scaled_acap);l_natask = POSITIVE(clbenv->lstats.scaled_atask);/* (1.8.3.2) 计算up的threshold, up-threshold = HMP_MAX_LOAD - HMP_MAX_LOAD*B族剩余*/clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /((b_nacap + l_nacap) * (b_natask + l_natask) + 1);/* (1.8.3.3) 计算down的threshold, down-threshold = HMP_MAX_LOAD*L族剩余*/clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask /((b_nacap + l_nacap) * (b_natask + l_natask) + 1);mt_sched_printf(sched_log, "[%s]\tup/dl:%4d/%4d L(%d:%4lu) b(%d:%4lu)\n", __func__,clbenv->bstats.threshold, clbenv->lstats.threshold,clbenv->ltarget, l_cap, clbenv->btarget, b_cap);
}||→/** Check whether this task should be migrated to big* Briefly summarize the flow as below;* 1) Migration stabilizing* 2) Filter low-priority task* 2.5) Keep all cpu busy* 3) Check CPU capacity* 4) Check dynamic migration threshold*/
static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,struct clb_env *clbenv)
{struct task_struct *p = task_of(se);struct clb_stats *L, *B;struct mcheck *check;int curr_cpu = cpu;
#ifdef CONFIG_HMP_TRACERunsigned int caller = clbenv->flags;
#endifL = &clbenv->lstats;B = &clbenv->bstats;check = &clbenv->mcheck;check->status = clbenv->flags;check->status |= HMP_TASK_UP_MIGRATION;check->result = 0;/** No migration is needed if* 1) There is only one cluster* 2) Task is already in big cluster* 3) It violates task affinity*/if (!L->ncpu || !B->ncpu|| cpumask_test_cpu(curr_cpu, &clbenv->bcpus)|| !cpumask_intersects(&clbenv->bcpus, tsk_cpus_allowed(p)))goto out;/* (1.9.1) 如果目标cpu短时间内已经执行了up操作,则为up unstable状态,退出 *//** [1] Migration stabilizing* Let the task load settle before doing another up migration.* It can prevent a bunch of tasks from migrating to a unstable CPU.*/if (!hmp_up_stable(*target_cpu))goto out;/* (1.9.2) 过滤掉优先级较低的进程,不进行迁移操作。具体有3个条件:(task_low_priority(p->prio) && \    // nice值大于5(B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \  // B组进程大于cou数 || 正常优先级的进程不为0(p->se.avg.loadwop_avg < 800))  // 平均负载小于800*//* [2] Filter low-priority task */
#ifdef CONFIG_SCHED_HMP_PRIO_FILTERif (hmp_low_prio_task_up_rejected(p, B, L)) {check->status |= HMP_LOW_PRIORITY_FILTER;goto trace;}
#endif/* (1.9.3) 如果B组的target cpu为idle,不用过多判断,直接准备迁移 *//* [2.5]if big is idle, just go to big */if (rq_length(*target_cpu) == 0) {check->status |= HMP_BIG_IDLE;check->status |= HMP_MIGRATION_APPROVED;check->result = 1;goto trace;}/* (1.9.4) 判断B族target cpu的capacity是否足够,(se_load(se) + cfs_load(cpu)) < (B->cpu_capacity - (B->cpu_capacity >> 2))// target cpu负载 + 要迁移的se负载 是否小于 3/4 B族cpu的capacity*//** [3] Check CPU capacity* Forbid up-migration if big CPU can't handle this task*/if (!hmp_task_fast_cpu_afford(B, se, *target_cpu)) {check->status |= HMP_BIG_CAPACITY_INSUFFICIENT;goto trace;}/* (1.9.5) 判断se的负载是否已经大于up-threshold(B->threshold) *//** [4] Check dynamic migration threshold* Migrate task from LITTLE to big if load is greater than up-threshold*/if (se_load(se) > B->threshold) {check->status |= HMP_MIGRATION_APPROVED;check->result = 1;}trace:
#ifdef CONFIG_HMP_TRACERif (check->result && hmp_caller_is_gb(caller))hmp_stats.nr_force_up++;trace_sched_hmp_stats(&hmp_stats);trace_sched_dynamic_threshold(task_of(se), B->threshold, check->status,curr_cpu, *target_cpu, se_load(se), B, L);trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);
#endif
out:return check->result;
}||→static int hmp_force_up_cpu_stop(void *data)
{/* (1.10.1) 执行进程迁移 */return hmp_active_task_migration_cpu_stop(data);
}|||→static int hmp_active_task_migration_cpu_stop(void *data)
{struct rq *busiest_rq = data;struct task_struct *p = NULL;int busiest_cpu = cpu_of(busiest_rq);int target_cpu = busiest_rq->push_cpu;struct rq *target_rq = cpu_rq(target_cpu);struct sched_domain *sd;raw_spin_lock_irq(&busiest_rq->lock);p = busiest_rq->migrate_task;/* make sure the requested cpu hasn't gone down in the meantime */if (unlikely(busiest_cpu != smp_processor_id() ||!busiest_rq->active_balance)) {goto out_unlock;}/* Is there any task to move? */if (busiest_rq->nr_running <= 1)goto out_unlock;/* Are both target and busiest cpu online */if (!cpu_online(busiest_cpu) || !cpu_online(target_cpu))goto out_unlock;/* Task has migrated meanwhile, abort forced migration */if ((!p) || (task_rq(p) != busiest_rq))goto out_unlock;/** This condition is "impossible", if it occurs* we need to fix it. Originally reported by* Bjorn Helgaas on a 128-cpu setup.*/WARN_ON(busiest_rq == target_rq);/* (1.10.1.1) 将源、目的rq lock住 *//* move a task from busiest_rq to target_rq */double_lock_balance(busiest_rq, target_rq);/* (1.10.1.2) 搜索target cpu所在的某一层次的sd,其sd->span[]即包含源cpu又包含目的cpu *//* Search for an sd spanning us and the target CPU. */rcu_read_lock();for_each_domain(target_cpu, sd) {if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))break;}/* (1.10.1.3) 构造数据,在同一sd下进行迁移 */if (likely(sd)) {struct lb_env env = {.sd             = sd,.dst_cpu        = target_cpu,.dst_rq         = target_rq,.src_cpu        = busiest_rq->cpu,.src_rq         = busiest_rq,.idle           = CPU_IDLE,};schedstat_inc(sd, alb_count);/* (1.10.1.4) 任务迁移 */if (move_specific_task(&env, p))schedstat_inc(sd, alb_pushed);elseschedstat_inc(sd, alb_failed);}rcu_read_unlock();double_unlock_balance(busiest_rq, target_rq);
out_unlock:busiest_rq->active_balance = 0;raw_spin_unlock_irq(&busiest_rq->lock);put_task_struct(p);return 0;
}||||→static int move_specific_task(struct lb_env *env, struct task_struct *pm)
{struct task_struct *p, *n;/* (1.10.1.4.1) 从源rq->cfs_tasks逐个取出任务,直到查到pm */list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {/* (1.10.1.4.2) task group的throttled判断 */if (throttled_lb_pair(task_group(p), env->src_rq->cpu,env->dst_cpu))continue;/* (1.10.1.4.3) 判断任务能否被迁移 */if (!hmp_can_migrate_task(p, env))continue;/* Check if we found the right task */if (p != pm)continue;/* (1.10.1.4.4) 迁移 */move_task(p, env);/** Right now, this is only the third place move_task()* is called, so we can safely collect move_task()* stats here rather than inside move_task().*/schedstat_inc(env->sd, lb_gained[env->idle]);return 1;}return 0;
}|||||→static void move_task(struct task_struct *p, struct lb_env *env)
{deactivate_task(env->src_rq, p, 0);set_task_cpu(p, env->dst_cpu);activate_task(env->dst_rq, p, 0);check_preempt_curr(env->dst_rq, p, 0);
}

4.2.3、hmp_force_down_migration()

hmp_force_down_migration()的操作主要有以下几个步骤:

  • 1、根据当前cpu,选择fast_cpu_mask、slow_cpu_mask;

hmp_force_down_migration尝试把fast cpu上的light进程迁移到slow cpu上,关于fast、slow的选择有以下几种场景:

  • 2、选择当前cpu的lightest进程作为迁移进程p;并不会遍历cpu上所有进程去选出lightest进程,只会查询curr进程和cfs_rq中5个进程中的lightest;

  • 3、根据slow_cpu_mask,选择一个负载最少的target cpu;

  • 4、根据源cpu(curr_cpu)、目的cpu(target_cpu),计算负载;

重要的数据计算方法和hmp_force_up_migration()一致,参考上一节;

  • 5、根据计算的负载情况,判断进程p是否符合down迁移条件((L->threshold >= se_load(se)),等其他条件);
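down迁移除了门限判断外,还要求big cpu确实oversubscribed、且little的剩余capacity容得下该进程。下面用一段用户态示意代码总结这几个数值条件(非内核实现,结构字段仿照clb_stats,capacity等取值为假设值):

```c
#include <stdio.h>
#include <stdbool.h>

/* 示意用的简化结构,字段名仿照clb_stats */
struct clb_sketch {
	int cpu_capacity;	/* 相对计算能力 */
	int acap;		/* 剩余capacity */
	int threshold;		/* down迁移门限 */
};

/* 对应hmp_down_migration()中[3][4]两步的数值判断(示意) */
static bool down_migration_allowed(int cfs_load_big, int se_load,
				   const struct clb_sketch *B,
				   const struct clb_sketch *L)
{
	/* big cpu是否oversubscribed:负载 >= 3/4 * B->cpu_capacity */
	bool big_oversubscribed =
		cfs_load_big >= (B->cpu_capacity - (B->cpu_capacity >> 2));
	/* little剩余capacity是否容得下该进程 */
	bool little_afford = L->acap > 0 && L->acap >= se_load;

	return big_oversubscribed && little_afford && se_load <= L->threshold;
}

int main(void)
{
	/* 假设值:B族capacity=1434,L族capacity=1024 */
	struct clb_sketch B = { .cpu_capacity = 1434, .acap = 200, .threshold = 700 };
	struct clb_sketch L = { .cpu_capacity = 1024, .acap = 500, .threshold = 300 };

	printf("light task on busy big : %d\n",
	       down_migration_allowed(1300, 200, &B, &L));	/* 允许down */
	printf("big not oversubscribed : %d\n",
	       down_migration_allowed(500, 200, &B, &L));	/* 不允许 */
	return 0;
}
```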

down-migration条件列表(hmp_down_migration()):

| 条件 | 含义 | 计算方法 | 计算解析 |
| --- | --- | --- | --- |
| [1] Migration stabilizing | 如果target cpu刚做过down迁移,不适合再进行迁移 | if (!hmp_down_stable(*target_cpu)) check->result = 0; | (((now - hmp_last_down_migration(cpu)) >> 10) |
static void hmp_force_down_migration(int this_cpu)
{int target_cpu;struct sched_entity *se;struct rq *target;unsigned long flags;unsigned int force = 0;struct task_struct *p;struct clb_env clbenv;
#ifdef CONFIG_SCHED_HMP_PLUSstruct sched_entity *orig;int B_cpu;
#endifstruct hmp_domain *hmp_domain = NULL;struct cpumask fast_cpu_mask, slow_cpu_mask;cpumask_clear(&fast_cpu_mask);cpumask_clear(&slow_cpu_mask);/* Migrate light task from big to LITTLE *//* (1) 如果当前cpu不是最慢的cpu(slowest),则尝试down操作 */if (!hmp_cpu_is_slowest(this_cpu)) {/* (2) 当前cpu所在的hmp_domain为fast_cpu_mask */hmp_domain = hmp_cpu_domain(this_cpu);cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus);/* (3) 查找相比当前最慢且online的hmp_domain作为slow_cpu_mask */while (!list_is_last(&hmp_domain->hmp_domains, &hmp_domains)) {struct list_head *pos = &hmp_domain->hmp_domains;hmp_domain = list_entry(pos->next, struct hmp_domain, hmp_domains);if (!cpumask_empty(&hmp_domain->cpus)) {cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus);break;}}}if (!hmp_domain || hmp_domain == hmp_cpu_domain(this_cpu))return;/* (4) 找不到可操作的fast_cpu_mask、slow_cpu_mask直接返回 */if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask))return;/* (5) 源cpu = this_cpu,源rq = target */force = 0;target = cpu_rq(this_cpu);raw_spin_lock_irqsave(&target->lock, flags);se = target->cfs.curr;if (!se) {raw_spin_unlock_irqrestore(&target->lock, flags);return;}/* (6) 首先尝试使用curr进程作为down迁移的进程 *//* Find task entity */if (!entity_is_task(se)) {struct cfs_rq *cfs_rq;cfs_rq = group_cfs_rq(se);while (cfs_rq) {se = cfs_rq->curr;cfs_rq = group_cfs_rq(se);}}
#ifdef CONFIG_SCHED_HMP_PLUS/* (7) 在curr进程开始的5个进程中,挑负载最轻的进程作为down迁移进程 */orig = se;se = hmp_get_lightest_task(orig, 1);if (!entity_is_task(se))p = task_of(orig);else
#endifp = task_of(se);
#ifdef CONFIG_SCHED_HMP_PLUS/* (8) 找出B族中负载最轻的cpu,如果其为idle状态,则放弃down操作 因为load_balance中的idle_balance会重新把任务迁移回idle的big cpu,避免相互的乒乓操作*//* Don't offload to little if there is one idle big, let load balance to do it's work *//* Also, to prevent idle_balance from leading to potential ping-pong */B_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, this_cpu, 0);if (B_cpu < nr_cpu_ids && !rq_length(B_cpu)) {raw_spin_unlock_irqrestore(&target->lock, flags);return;}
#endif/* (9) 找出L族中负载最轻的cpu作为target_cpu */target_cpu = hmp_select_cpu(HMP_GB, p, &slow_cpu_mask, -1, 1);if (target_cpu >= num_possible_cpus()) {raw_spin_unlock_irqrestore(&target->lock, flags);return;}/* (10) 迁移前对B族、L族负载和threshold的计算 *//* Collect cluster information */memset(&clbenv, 0, sizeof(clbenv));clbenv.flags |= HMP_GB;clbenv.btarget = this_cpu;clbenv.ltarget = target_cpu;cpumask_copy(&clbenv.lcpus, &slow_cpu_mask);cpumask_copy(&clbenv.bcpus, &fast_cpu_mask);sched_update_clbstats(&clbenv);#ifdef CONFIG_SCHED_HMP_PLUSif (cpu_rq(this_cpu)->cfs.h_nr_running < 2) {raw_spin_unlock_irqrestore(&target->lock, flags);return;}
#endif/* (11) 检查down操作的迁移条件是否成立,hmp_down_migration() *//* Check migration threshold */if (!target->active_balance &&hmp_down_migration(this_cpu, &target_cpu, se, &clbenv) &&!cpu_park(cpu_of(target))) {if (p->state != TASK_DEAD) {get_task_struct(p);target->active_balance = 1; /* force down */target->push_cpu = target_cpu;target->migrate_task = p;force = 1;trace_sched_hmp_migrate(p, target->push_cpu, 1);hmp_next_down_delay(&p->se, target->push_cpu);}}raw_spin_unlock_irqrestore(&target->lock, flags);/* (12) 条件成立进行实际的down迁移操作hmp_force_down_cpu_stop() */if (force) {if (stop_one_cpu_dispatch(cpu_of(target),hmp_force_down_cpu_stop,target, &target->active_balance_work)) {put_task_struct(p); /* out of rq->lock */raw_spin_lock_irqsave(&target->lock, flags);target->active_balance = 0;force = 0;raw_spin_unlock_irqrestore(&target->lock, flags);}}}|→static struct sched_entity *hmp_get_lightest_task(struct sched_entity *se, int migrate_down)
{int num_tasks = hmp_max_tasks;struct sched_entity *min_se = se;unsigned long int min_ratio = se->avg.loadwop_avg;const struct cpumask *hmp_target_mask = NULL;if (migrate_down) {struct hmp_domain *hmp;/* (7.1) 如果cpu是最慢cpu(slowest)则直接退出,因为本函数的目的是找出faster cpu中lightest进程*/if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))return min_se;/* (7.2) 将更slow一级的hmp_domain作为进程cpu亲和力的mask */hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));hmp_target_mask = &hmp->cpus;}/* The currently running task is not on the runqueue */se = __pick_first_entity(cfs_rq_of(se));/* (7.3) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出lightest进程 比较使用的负载为se->avg.loadwop_avg,不带weight分量*/while (num_tasks && se) {if (entity_is_task(se) &&(se->avg.loadwop_avg < min_ratio && hmp_target_mask &&cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se))))) {min_se = se;min_ratio = se->avg.loadwop_avg;}se = __pick_next_entity(se);num_tasks--;}return min_se;
}|→/** Check whether this task should be migrated to LITTLE* Briefly summarize the flow as below;* 1) Migration stabilizing* 1.5) Keep all cpu busy* 2) Filter low-priority task* 3) Check CPU capacity* 4) Check dynamic migration threshold*/
static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,struct clb_env *clbenv)
{struct task_struct *p = task_of(se);struct clb_stats *L, *B;struct mcheck *check;int curr_cpu = cpu;unsigned int caller = clbenv->flags;L = &clbenv->lstats;B = &clbenv->bstats;check = &clbenv->mcheck;check->status = caller;check->status |= HMP_TASK_DOWN_MIGRATION;check->result = 0;/** No migration is needed if* 1) There is only one cluster* 2) Task is already in LITTLE cluster* 3) It violates task affinity*/if (!L->ncpu || !B->ncpu|| cpumask_test_cpu(curr_cpu, &clbenv->lcpus)|| !cpumask_intersects(&clbenv->lcpus, tsk_cpus_allowed(p)))goto out;/* (11.1) 目的little cpu target_cpu近期如果有做过down操作,不适合再做down迁移 *//** [1] Migration stabilizing* Let the task load settle before doing another down migration.* It can prevent a bunch of tasks from migrating to a unstable CPU.*/if (!hmp_down_stable(*target_cpu))goto out;/* (11.2) 如果big busy,little idle则不用进行threshold判断 *//* [1.5]if big is busy and little is idle, just go to little */if (rq_length(*target_cpu) == 0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu) > 0) {struct rq *curr_rq = cpu_rq(curr_cpu);/* (11.2.1) 如果big cpu,curr进程不是heavy进程,但是p是heavy进程,直接准许down迁移 heavy进程的判断标准为:负载>=650*//* if current big core is not heavy task and wake up task is heavy task no go to little */if (!(!is_heavy_task(curr_rq->curr) && is_heavy_task(p))) {check->status |= HMP_BIG_BUSY_LITTLE_IDLE;check->status |= HMP_MIGRATION_APPROVED;check->result = 1;goto trace;}}/* (11.3) 低优先级进程,如果满足以下条件,准许迁移:(task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \   // nice值大于5B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \        // B和L都不是特别空闲(p->se.avg.loadwop_avg < 800))                                 // L上准备迁移的进程负载小于800*//* [2] Filter low-priority task */
#ifdef CONFIG_SCHED_HMP_PRIO_FILTERif (hmp_low_prio_task_down_allowed(p, B, L)) {cfs_nr_dequeuing_low_prio(curr_cpu)++;check->status |= HMP_LOW_PRIORITY_FILTER;check->status |= HMP_MIGRATION_APPROVED;check->result = 1;goto trace;}
#endif/** [3] Check CPU capacity* Forbid down-migration if either of the following conditions is true* 1) big cpu is not oversubscribed (if big CPU seems to have spare*    cycles, do not force this task to run on LITTLE CPU, but*    keep it staying in its previous cluster instead)* 2) LITTLE cpu doesn't have available capacity for this new task*//* (11.4) 如果big cpu有足够的空闲周期,不需要强制把light任务迁移到little cpu上 cfs_load(cpu) < (B->cpu_capacity - (B->cpu_capacity >> 2))*/if (!hmp_fast_cpu_oversubscribed(caller, B, se, curr_cpu)) {check->status |= HMP_BIG_NOT_OVERSUBSCRIBED;goto trace;}/* (11.5) 判断L族cpu的capacity是否足够容纳需要迁移的进程,(L->acap > 0 && L->acap >= se_load(se))*/if (!hmp_task_slow_cpu_afford(L, se)) {check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT;goto trace;}/* (11.6) 判断se的负载是否已经小于down-threshold(L->threshold) *//** [4] Check dynamic migration threshold* Migrate task from big to LITTLE if load ratio is less than* or equal to down-threshold*/if (L->threshold >= se_load(se)) {check->status |= HMP_MIGRATION_APPROVED;check->result = 1;}trace:
#ifdef CONFIG_HMP_TRACERif (check->result && hmp_caller_is_gb(caller))hmp_stats.nr_force_down++;trace_sched_hmp_stats(&hmp_stats);trace_sched_dynamic_threshold(task_of(se), L->threshold, check->status,curr_cpu, *target_cpu, se_load(se), B, L);trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);
#endif
out:return check->result;
}

4.2.4、hmp_select_task_rq_fair()

4.3、cpu freq调整

前面讲的负载均衡的手段都是负载迁移,把负载迁移到最idle或者最省power的cpu上。另外一种方式就是调整cpu的freq,从而改变cpu的curr_capacity,来满足性能和功耗的需求。
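频率与当前算力的关系可以用下面的用户态示意代码近似表达(简化模型,假设算力与频率近似成线性;capacity_orig、频点等数值为假设值):

```c
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1 << SCHED_CAPACITY_SHIFT)

/* 简化模型:当前算力 ~= 最大算力 * (当前频率 / 最高频率) */
static unsigned long capacity_curr(unsigned long capacity_orig,
				   unsigned long cur_khz, unsigned long max_khz)
{
	unsigned long freq_scale = cur_khz * SCHED_CAPACITY_SCALE / max_khz;

	return capacity_orig * freq_scale >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	/* 假设:该cpu最大算力1024,最高频率2.0GHz */
	unsigned long khz[] = { 800000, 1400000, 2000000 };

	for (int i = 0; i < 3; i++)
		printf("freq=%lukHz -> capacity_curr=%lu\n",
		       khz[i], capacity_curr(1024, khz[i], 2000000));
	return 0;
}
```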

cpu的频率调整是基于3个层次的:cpufreq governor、cpufreq core、cpufreq driver。

  • 1、cpufreq governor决定cpu调频的算法,计算负载、根据负载的变化来动态调整频率;
  • 2、cpufreq core对通用层进行了一些封装,比如cpufreq_policy的封装;
  • 3、cpufreq driver是底层操作的实现,比如freq_table的初始化、cpu target频率的配置;
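下面用一段用户态示意代码模拟这三层之间的调用关系(纯示意,sample_governor_update()等函数名均为虚构,并非内核接口):governor根据负载算出目标频率,core按policy的min/max做范围裁剪,driver再把目标频率落到freq_table中支持的频点上:

```c
#include <stdio.h>

/* --- driver层(示意):把目标频率落到支持的频点上 --- */
static const unsigned int freq_table_khz[] = { 600000, 1000000, 1500000, 2000000 };

static unsigned int driver_set_target(unsigned int target_khz)
{
	unsigned int best = freq_table_khz[0];
	unsigned int n = sizeof(freq_table_khz) / sizeof(freq_table_khz[0]);

	/* 选择不超过target的最高频点(示意) */
	for (unsigned int i = 0; i < n; i++)
		if (freq_table_khz[i] <= target_khz)
			best = freq_table_khz[i];
	printf("driver: set freq to %u kHz\n", best);
	return best;
}

/* --- core层(示意):根据policy->min/max裁剪目标频率 --- */
struct policy_sketch { unsigned int min, max, cur; };

static void core_driver_target(struct policy_sketch *policy, unsigned int target_khz)
{
	if (target_khz < policy->min) target_khz = policy->min;
	if (target_khz > policy->max) target_khz = policy->max;
	policy->cur = driver_set_target(target_khz);
}

/* --- governor层(示意):按负载比例折算目标频率 --- */
static void sample_governor_update(struct policy_sketch *policy, unsigned int load_pct)
{
	unsigned int target = policy->max * load_pct / 100;

	printf("governor: load=%u%% -> target %u kHz\n", load_pct, target);
	core_driver_target(policy, target);
}

int main(void)
{
	struct policy_sketch policy = { .min = 600000, .max = 2000000, .cur = 600000 };

	sample_governor_update(&policy, 30);
	sample_governor_update(&policy, 90);
	return 0;
}
```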

如果是MTK平台,cpufreq driver除了接受governor的频率调整还需要接受ppm的频率调整,它的框图大概如下:

4.3.1、cpufreq core & cpufreq driver

cpufreq core层次最核心的数据结构是每个cpu对应一个的cpufreq_policy,存放在per_cpu(cpufreq_cpu_data, cpu)变量中。实际上cpufreq_policy是每个cluster一个:在现有架构中,同一个cluster内的cpu运行在同一个频率上,所以同cluster中所有cpu的per_cpu(cpufreq_cpu_data, cpu)都指向同一个cpufreq_policy。
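这种"同cluster共享一个policy"的关系,可以用下面的用户态示意代码表达(示意,非内核实现):per-cpu指针数组中,同一个cluster的表项都指向同一个policy对象,任何一个cpu修改policy,同cluster的其他cpu看到的都是同一份数据:

```c
#include <stdio.h>

#define NR_CPUS 8

struct policy_sketch {
	unsigned int cpu;	/* 管理该policy的cpu */
	unsigned int cur_khz;
};

/* 模拟per_cpu(cpufreq_cpu_data, cpu):每个cpu一个指针 */
static struct policy_sketch *cpufreq_cpu_data[NR_CPUS];

int main(void)
{
	/* 假设cpu0-3为cluster0,cpu4-7为cluster1,每个cluster一个policy */
	static struct policy_sketch policy_ll = { .cpu = 0, .cur_khz = 1000000 };
	static struct policy_sketch policy_b  = { .cpu = 4, .cur_khz = 1800000 };

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		cpufreq_cpu_data[cpu] = (cpu < 4) ? &policy_ll : &policy_b;

	/* 通过cpu2修改cluster0的policy,cluster0内所有cpu都能看到 */
	cpufreq_cpu_data[2]->cur_khz = 1200000;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> policy of cpu%u, cur=%u kHz\n",
		       cpu, cpufreq_cpu_data[cpu]->cpu, cpufreq_cpu_data[cpu]->cur_khz);
	return 0;
}
```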

4.3.1.1、cpufreq_policy policy初始化

struct cpufreq_policy {/* CPUs sharing clock, require sw coordination */cpumask_var_t       cpus;   /* Online CPUs only */cpumask_var_t       related_cpus; /* Online + Offline CPUs */cpumask_var_t       real_cpus; /* Related and present */unsigned int        shared_type; /* ACPI: ANY or ALL affected CPUsshould set cpufreq */unsigned int        cpu;    /* cpu managing this policy, must be online */struct clk      *clk;struct cpufreq_cpuinfo  cpuinfo;/* see above */unsigned int        min;    /* in kHz */unsigned int        max;    /* in kHz */unsigned int        cur;    /* in kHz, only needed if cpufreq* governors are used */unsigned int        restore_freq; /* = policy->cur before transition */unsigned int        suspend_freq; /* freq to set during suspend */unsigned int        policy; /* see above */unsigned int        last_policy; /* policy before unplug */struct cpufreq_governor *governor; /* see below */void            *governor_data;bool            governor_enabled; /* governor start/stop flag */char            last_governor[CPUFREQ_NAME_LEN]; /* last governor used */struct work_struct  update; /* if update_policy() needs to be* called, but you're in IRQ context */struct cpufreq_user_policy user_policy;struct cpufreq_frequency_table  *freq_table;struct list_head        policy_list;struct kobject      kobj;struct completion   kobj_unregister;/** The rules for this semaphore:* - Any routine that wants to read from the policy structure will*   do a down_read on this semaphore.* - Any routine that will write to the policy structure and/or may take away*   the policy altogether (eg. CPU hotplug), will hold this lock in write*   mode before doing so.** Additional rules:* - Lock should not be held across*     __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);*/struct rw_semaphore rwsem;/* Synchronization for frequency transitions */bool            transition_ongoing; /* Tracks transition status */spinlock_t      transition_lock;wait_queue_head_t   transition_wait;struct task_struct  *transition_task; /* Task which is doing the transition *//* cpufreq-stats */struct cpufreq_stats    *stats;/* For cpufreq driver's internal use */void            *driver_data;
}

在系统初始化的时候会初始化online cpu的cpufreq_policy,cpu在hotplug online的时候也会重新初始化cpufreq_policy。

  • 1、在mtk的cpufreq_driver驱动初始化函数_mt_cpufreq_pdrv_probe()中注册了_mt_cpufreq_driver:
static int _mt_cpufreq_pdrv_probe(struct platform_device *pdev)
{/* 注册cpufreq_driver */cpufreq_register_driver(&_mt_cpufreq_driver);/* 注册ppm的回调 */mt_ppm_register_client(PPM_CLIENT_DVFS, &ppm_limit_callback);}static struct cpufreq_driver _mt_cpufreq_driver = {.flags = CPUFREQ_ASYNC_NOTIFICATION,.verify = _mt_cpufreq_verify,.target = _mt_cpufreq_target,.init = _mt_cpufreq_init,.exit = _mt_cpufreq_exit,.get = _mt_cpufreq_get,.name = "mt-cpufreq",.attr = _mt_cpufreq_attr,
};
  • 2、在驱动注册cpufreq_register_driver()过程中会初始化online cpu的cpufreq_policy:
_mt_cpufreq_pdrv_probe() -> cpufreq_register_driver() -> subsys_interface_register() -> cpufreq_add_dev() -> cpufreq_online()↓static int cpufreq_online(unsigned int cpu)
{struct cpufreq_policy *policy;bool new_policy;unsigned long flags;unsigned int j;int ret;pr_debug("%s: bringing CPU%u online\n", __func__, cpu);/* (1) 检查per_cpu(cpufreq_cpu_data, cpu)中的cpufreq_policy, 如果为NULL,重新分配空间*//* Check if this CPU already has a policy to manage it */policy = per_cpu(cpufreq_cpu_data, cpu);if (policy) {WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus));if (!policy_is_inactive(policy))return cpufreq_add_policy_cpu(policy, cpu);/* This is the only online CPU for the policy.  Start over. */new_policy = false;down_write(&policy->rwsem);policy->cpu = cpu;policy->governor = NULL;up_write(&policy->rwsem);} else {new_policy = true;policy = cpufreq_policy_alloc(cpu);if (!policy)return -ENOMEM;}cpumask_copy(policy->cpus, cpumask_of(cpu));/* (2) 调用cpufreq_driver的初始化函数来初始化cpufreq_policy, 这步比较重要,初始化了以下的数据:*//* call driver. From then on the cpufreq must be able* to accept all calls to ->verify and ->setpolicy for this CPU*/ret = cpufreq_driver->init(policy);if (ret) {pr_debug("initialization failed\n");goto out_free_policy;}down_write(&policy->rwsem);/* (3) 如果cpufreq_policy是新分配空间的,做一些相应的初始化工作*/if (new_policy) {/* related_cpus should at least include policy->cpus. */cpumask_copy(policy->related_cpus, policy->cpus);/* Remember CPUs present at the policy creation time. */cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask);/* Name and add the kobject */ret = kobject_add(&policy->kobj, cpufreq_global_kobject,"policy%u",cpumask_first(policy->related_cpus));if (ret) {pr_err("%s: failed to add policy->kobj: %d\n", __func__,ret);goto out_exit_policy;}}/** affected cpus must always be the one, which are online. We aren't* managing offline cpus here.*/cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);if (new_policy) {policy->user_policy.min = policy->min;policy->user_policy.max = policy->max;write_lock_irqsave(&cpufreq_driver_lock, flags);/* (3.1) 同一个cluster中所有cpu的per_cpu(cpufreq_cpu_data, j),共享同一个cpufreq_policy */for_each_cpu(j, policy->related_cpus)per_cpu(cpufreq_cpu_data, j) = policy;write_unlock_irqrestore(&cpufreq_driver_lock, flags);}/* (4) 获取cpufreq_policy的当前频率*/if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {policy->cur = cpufreq_driver->get(policy->cpu);if (!policy->cur) {pr_err("%s: ->get() failed\n", __func__);goto out_exit_policy;}}/** Sometimes boot loaders set CPU frequency to a value outside of* frequency table present with cpufreq core. In such cases CPU might be* unstable if it has to run on that frequency for long duration of time* and so its better to set it to a frequency which is specified in* freq-table. This also makes cpufreq stats inconsistent as* cpufreq-stats would fail to register because current frequency of CPU* isn't found in freq-table.** Because we don't want this change to effect boot process badly, we go* for the next freq which is >= policy->cur ('cur' must be set by now,* otherwise we will end up setting freq to lowest of the table as 'cur'* is initialized to zero).** We are passing target-freq as "policy->cur - 1" otherwise* __cpufreq_driver_target() would simply fail, as policy->cur will be* equal to target-freq.*/if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK)&& has_target()) {/* Are we running at unknown frequency ? 
*/ret = cpufreq_frequency_table_get_index(policy, policy->cur);if (ret == -EINVAL) {/* Warn user and fix it */pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n",__func__, policy->cpu, policy->cur);ret = __cpufreq_driver_target(policy, policy->cur - 1,CPUFREQ_RELATION_L);/** Reaching here after boot in a few seconds may not* mean that system will remain stable at "unknown"* frequency for longer duration. Hence, a BUG_ON().*/BUG_ON(ret);pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n",__func__, policy->cpu, policy->cur);}}blocking_notifier_call_chain(&cpufreq_policy_notifier_list,CPUFREQ_START, policy);if (new_policy) {ret = cpufreq_add_dev_interface(policy);if (ret)goto out_exit_policy;blocking_notifier_call_chain(&cpufreq_policy_notifier_list,CPUFREQ_CREATE_POLICY, policy);write_lock_irqsave(&cpufreq_driver_lock, flags);list_add(&policy->policy_list, &cpufreq_policy_list);write_unlock_irqrestore(&cpufreq_driver_lock, flags);}/* (5) 调用cpufreq governor的初始化函数,来初始化cpufreq_policy*/ret = cpufreq_init_policy(policy);if (ret) {pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n",__func__, cpu, ret);/* cpufreq_policy_free() will notify based on this */new_policy = false;goto out_exit_policy;}up_write(&policy->rwsem);kobject_uevent(&policy->kobj, KOBJ_ADD);/* Callback for handling stuff after policy is ready */if (cpufreq_driver->ready)cpufreq_driver->ready(policy);pr_debug("initialization complete\n");return 0;out_exit_policy:up_write(&policy->rwsem);if (cpufreq_driver->exit)cpufreq_driver->exit(policy);
out_free_policy:cpufreq_policy_free(policy, !new_policy);return ret;
}|→static int _mt_cpufreq_init(struct cpufreq_policy *policy)
{int ret = -EINVAL;unsigned long flags;FUNC_ENTER(FUNC_LV_MODULE);policy->shared_type = CPUFREQ_SHARED_TYPE_ANY;cpumask_setall(policy->cpus);policy->cpuinfo.transition_latency = 1000;{enum mt_cpu_dvfs_id id = _get_cpu_dvfs_id(policy->cpu);struct mt_cpu_dvfs *p = id_to_cpu_dvfs(id);unsigned int lv = _mt_cpufreq_get_cpu_level();struct opp_tbl_info *opp_tbl_info;struct opp_tbl_m_info *opp_tbl_m_info;struct opp_tbl_m_info *opp_tbl_m_cci_info;struct mt_cpu_dvfs *p_cci;cpufreq_ver("DVFS: _mt_cpufreq_init: %s(cpu_id = %d)\n", cpu_dvfs_get_name(p), p->cpu_id);opp_tbl_info = &opp_tbls[id][lv];p->cpu_level = lv;/* (2.1) 给policy->freq_table赋值 给policy->cpus赋值给policy->related_cpus赋值*/ret = _mt_cpufreq_setup_freqs_table(policy,opp_tbl_info->opp_tbl, opp_tbl_info->size);/* (2.2) 给policy->cpuinfo.max_freq赋值 给policy->cpuinfo.min_freq赋值*/policy->cpuinfo.max_freq = cpu_dvfs_get_max_freq(p);policy->cpuinfo.min_freq = cpu_dvfs_get_min_freq(p);opp_tbl_m_info = &opp_tbls_m[id][lv];p->freq_tbl = opp_tbl_m_info->opp_tbl_m;cpufreq_lock(flags);/* Sync p */if (_mt_cpufreq_sync_opp_tbl_idx(p) >= 0)if (p->idx_normal_max_opp == -1)p->idx_normal_max_opp = p->idx_opp_tbl;/* (2.3) 给policy->cur赋值 给policy->max赋值给policy->min赋值*/policy->cur = cpu_dvfs_get_cur_freq(p); /* use cur phy freq is better */policy->max = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_limit);policy->min = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_base);p->mt_policy = policy;p->armpll_is_available = 1;#ifdef CONFIG_HYBRID_CPU_DVFSif (turbo_flag && cpu_dvfs_is(p, MT_CPU_DVFS_B) && !turbo_is_inited) {unsigned int turbo_f, turbo_v;turbo_f = ((cpu_dvfs_get_max_freq(p) * 104 / 100) / 13) * 13 / 1000;if (picachu_need_higher_volt(MT_PICACHU_DOMAIN2))turbo_v = MAX_VPROC_VOLT;elseturbo_v = MAX_VPROC_VOLT - 2000;/* turbo_v = p->opp_tbl[0].cpufreq_volt; */cpuhvfs_set_turbo_scale(turbo_f * 1000, turbo_v);turbo_is_inited = 1;}
#endif/* Sync cci */if (cci_is_inited == 0) {p_cci = id_to_cpu_dvfs(MT_CPU_DVFS_CCI);/* init cci freq idx */if (_mt_cpufreq_sync_opp_tbl_idx(p_cci) >= 0)if (p_cci->idx_normal_max_opp == -1)p_cci->idx_normal_max_opp = p_cci->idx_opp_tbl;opp_tbl_m_cci_info = &opp_tbls_m[MT_CPU_DVFS_CCI][lv];p_cci->freq_tbl = opp_tbl_m_cci_info->opp_tbl_m;p_cci->mt_policy = NULL;p_cci->armpll_is_available = 1;cci_is_inited = 1;}
#ifdef CONFIG_HYBRID_CPU_DVFScpuhvfs_set_cluster_on_off(arch_get_cluster_id(p->cpu_id), 1);
#endifcpufreq_unlock(flags);}if (ret)cpufreq_err("failed to setup frequency table\n");FUNC_EXIT(FUNC_LV_MODULE);return ret;
}||→static int _mt_cpufreq_setup_freqs_table(struct cpufreq_policy *policy,struct mt_cpu_freq_info *freqs, int num)
{struct mt_cpu_dvfs *p;int ret = 0;FUNC_ENTER(FUNC_LV_LOCAL);p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));#ifdef CONFIG_CPU_FREQret = cpufreq_frequency_table_cpuinfo(policy, p->freq_tbl_for_cpufreq);/* (2.1.1) 给policy->freq_table赋值 */if (!ret)policy->freq_table = p->freq_tbl_for_cpufreq;/* (2.1.2) 根据cpu相同cluster中有哪些cpu 给policy->cpus赋值给policy->related_cpus赋值*/cpumask_copy(policy->cpus, topology_core_cpumask(policy->cpu));cpumask_copy(policy->related_cpus, policy->cpus);
#endifFUNC_EXIT(FUNC_LV_LOCAL);return 0;
}
  • 3、After cpufreq_online() has finished initializing the cpufreq_policy, it finally calls cpufreq_init_policy() to carry on with the governor initialization:
static int cpufreq_init_policy(struct cpufreq_policy *policy)
{struct cpufreq_governor *gov = NULL;struct cpufreq_policy new_policy;memcpy(&new_policy, policy, sizeof(*policy));/* (5.1) 使用last或者default的governor,给new_policy.governor赋值*//* Update governor of new_policy to the governor used before hotplug */gov = find_governor(policy->last_governor);if (gov)pr_debug("Restoring governor %s for cpu %d\n",policy->governor->name, policy->cpu);elsegov = CPUFREQ_DEFAULT_GOVERNOR;new_policy.governor = gov;/* Use the default policy if there is no last_policy. */if (cpufreq_driver->setpolicy) {if (policy->last_policy)new_policy.policy = policy->last_policy;elsecpufreq_parse_governor(gov->name, &new_policy.policy,NULL);}/* (5.2) 启动governor来使用cpufreq_policy *//* set default policy */return cpufreq_set_policy(policy, &new_policy);
}|→static int cpufreq_set_policy(struct cpufreq_policy *policy,struct cpufreq_policy *new_policy)
{struct cpufreq_governor *old_gov;int ret;pr_debug("setting new policy for CPU %u: %u - %u kHz\n",new_policy->cpu, new_policy->min, new_policy->max);memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));/* (5.2.1) 对policy、new_policy的一堆合法性判断 *//** This check works well when we store new min/max freq attributes,* because new_policy is a copy of policy with one field updated.*/if (new_policy->min > new_policy->max)return -EINVAL;/* verify the cpu speed can be set within this limit */ret = cpufreq_driver->verify(new_policy);if (ret)return ret;/* adjust if necessary - all reasons */blocking_notifier_call_chain(&cpufreq_policy_notifier_list,CPUFREQ_ADJUST, new_policy);/** verify the cpu speed can be set within this limit, which might be* different to the first one*/ret = cpufreq_driver->verify(new_policy);if (ret)return ret;/* notification of the new policy */blocking_notifier_call_chain(&cpufreq_policy_notifier_list,CPUFREQ_NOTIFY, new_policy);scale_freq_capacity(new_policy, NULL);policy->min = new_policy->min;policy->max = new_policy->max;trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);pr_debug("new min and max freqs are %u - %u kHz\n",policy->min, policy->max);if (cpufreq_driver->setpolicy) {policy->policy = new_policy->policy;pr_debug("setting range\n");return cpufreq_driver->setpolicy(new_policy);}if (new_policy->governor == policy->governor)goto out;pr_debug("governor switch\n");/* (5.2.2) 如果旧的governor在工作中,依次调用 CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT停止旧的governor*//* save old, working values */old_gov = policy->governor;/* end old governor */if (old_gov) {ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);if (ret) {/* This can happen due to race with other operations */pr_debug("%s: Failed to Stop Governor: %s (%d)\n",__func__, old_gov->name, ret);return ret;}up_write(&policy->rwsem);ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);down_write(&policy->rwsem);if (ret) {pr_err("%s: Failed to Exit Governor: %s (%d)\n",__func__, old_gov->name, ret);return ret;}}/* (5.2.3) 依次调用 CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START让新的governor开工*//* start new governor */policy->governor = new_policy->governor;ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);if (!ret) {ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);if (!ret)goto out;up_write(&policy->rwsem);__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);down_write(&policy->rwsem);}/* new governor failed, so re-start old one */pr_debug("starting governor %s failed\n", policy->governor->name);if (old_gov) {policy->governor = old_gov;if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))policy->governor = NULL;else__cpufreq_governor(policy, CPUFREQ_GOV_START);}return ret;out:pr_debug("governor: change or update limits\n");return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}||→static int __cpufreq_governor(struct cpufreq_policy *policy,unsigned int event)
{/* __cpufreq_governor()调用的各种命令最后调用的都是governor的具体函数 */ret = policy->governor->governor(policy, event);
}
  • 4、Taking the interactive governor as an example, the following shows how policy->governor->governor() implements the CPUFREQ_GOV_POLICY_INIT, CPUFREQ_GOV_START, CPUFREQ_GOV_STOP and CPUFREQ_GOV_POLICY_EXIT commands:
struct cpufreq_governor cpufreq_gov_interactive = {.name = "interactive",.governor = cpufreq_governor_interactive,.max_transition_latency = 10000000,.owner = THIS_MODULE,
};↓static int cpufreq_governor_interactive(struct cpufreq_policy *policy,unsigned int event)
{int rc;unsigned int j;struct cpufreq_interactive_cpuinfo *pcpu;struct cpufreq_frequency_table *freq_table;struct cpufreq_interactive_tunables *tunables;unsigned long flags;if (have_governor_per_policy())tunables = policy->governor_data;elsetunables = common_tunables;WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));switch (event) {/* (1) CPUFREQ_GOV_POLICY_INIT命令的实现:初始化tunables,tunables是interactive governor在计算时使用的各种参数相关的sysfs注册*/case CPUFREQ_GOV_POLICY_INIT:if (have_governor_per_policy()) {WARN_ON(tunables);} else if (tunables) {tunables->usage_count++;policy->governor_data = tunables;return 0;}tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);if (!tunables) {pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);return -ENOMEM;}tunables->usage_count = 1;tunables->above_hispeed_delay = default_above_hispeed_delay;tunables->nabove_hispeed_delay =ARRAY_SIZE(default_above_hispeed_delay);tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;tunables->target_loads = default_target_loads;tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;tunables->timer_rate = DEFAULT_TIMER_RATE;tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;tunables->timer_slack_val = DEFAULT_TIMER_SLACK;spin_lock_init(&tunables->target_loads_lock);spin_lock_init(&tunables->above_hispeed_delay_lock);policy->governor_data = tunables;if (!have_governor_per_policy()) {common_tunables = tunables;}rc = sysfs_create_group(get_governor_parent_kobj(policy),get_sysfs_attr());if (rc) {kfree(tunables);policy->governor_data = NULL;if (!have_governor_per_policy()) {common_tunables = NULL;}return rc;}if (!policy->governor->initialized) {idle_notifier_register(&cpufreq_interactive_idle_nb);cpufreq_register_notifier(&cpufreq_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);}break;/* (2) CPUFREQ_GOV_POLICY_EXIT命令的实现:remove相关的sysfs*/case CPUFREQ_GOV_POLICY_EXIT:if (!--tunables->usage_count) {if (policy->governor->initialized == 1) {cpufreq_unregister_notifier(&cpufreq_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);idle_notifier_unregister(&cpufreq_interactive_idle_nb);}
#ifdef CONFIG_MEIZU_BSP}
#elsesysfs_remove_group(get_governor_parent_kobj(policy),get_sysfs_attr());kfree(tunables);common_tunables = NULL;}policy->governor_data = NULL;
#endif //CONFIG_MEIZU_BSPbreak;/* (3) CPUFREQ_GOV_START命令的实现:因为同一个cluster中的多个cpu是共享一个cpufreq_policy的,所以使用同一个cpufreq_policy来初始化cluster中多个online cpu的per_cpu(cpuinfo, j)变量:pcpu->target_freq    // 当前频率pcpu->freq_table     // 频率表并且启动cpu上的interactive_timer=pcpu->cpu_timer:cpufreq_interactive_timer_start(tunables, j);*/case CPUFREQ_GOV_START:mutex_lock(&gov_lock);freq_table = cpufreq_frequency_get_table(policy->cpu);if (tunables && !tunables->hispeed_freq)tunables->hispeed_freq = policy->max;for_each_cpu(j, policy->cpus) {pcpu = &per_cpu(cpuinfo, j);pcpu->policy = policy;pcpu->target_freq = policy->cur;pcpu->freq_table = freq_table;pcpu->floor_freq = pcpu->target_freq;pcpu->pol_floor_val_time =ktime_to_us(ktime_get());pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;down_write(&pcpu->enable_sem);del_timer_sync(&pcpu->cpu_timer);del_timer_sync(&pcpu->cpu_slack_timer);cpufreq_interactive_timer_start(tunables, j);pcpu->governor_enabled = 1;up_write(&pcpu->enable_sem);}mutex_unlock(&gov_lock);break;/* (4) CPUFREQ_GOV_STOP命令的实现:如果同一个cluster中的多个cpu都已经offline,停掉对应的governor:停掉cpu上的interactive_timer=pcpu->cpu_timer*/case CPUFREQ_GOV_STOP:mutex_lock(&gov_lock);for_each_cpu(j, policy->cpus) {pcpu = &per_cpu(cpuinfo, j);down_write(&pcpu->enable_sem);pcpu->governor_enabled = 0;del_timer_sync(&pcpu->cpu_timer);del_timer_sync(&pcpu->cpu_slack_timer);up_write(&pcpu->enable_sem);}mutex_unlock(&gov_lock);break;case CPUFREQ_GOV_LIMITS:if (policy->max < policy->cur)__cpufreq_driver_target(policy,policy->max, CPUFREQ_RELATION_H);else if (policy->min > policy->cur)__cpufreq_driver_target(policy,policy->min, CPUFREQ_RELATION_L);for_each_cpu(j, policy->cpus) {pcpu = &per_cpu(cpuinfo, j);down_read(&pcpu->enable_sem);if (pcpu->governor_enabled == 0) {up_read(&pcpu->enable_sem);continue;}spin_lock_irqsave(&pcpu->target_freq_lock, flags);if (policy->max < pcpu->target_freq)pcpu->target_freq = policy->max;else if (policy->min > pcpu->target_freq)pcpu->target_freq = policy->min;spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);up_read(&pcpu->enable_sem);}break;}

4.3.1.2、cpufreq frequency configuration

One important job of cpufreq is to actually program the CPU frequency that the user requests, and this part of the code again requires cooperation between the cpufreq core and the cpufreq driver. Frequency adjustment is also called DVFS (Dynamic Voltage and Frequency Scaling): the voltage and the frequency must be programmed together according to their mapping.
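In an OPP (operating performance point) table every supported frequency is paired with the minimum voltage that is stable at that frequency, and the driver programs clock and regulator together from one entry. The sketch below is a minimal userspace-style illustration of such a lookup; the table values and the select_opp() helper are invented for illustration and are not the MTK driver's data (that lives in opp_tbls[] and is installed by _mt_cpufreq_setup_freqs_table() shown below).

#include <stdio.h>

/* Hypothetical OPP table: frequency in kHz paired with the minimum
 * stable voltage in uV. All values are invented. */
struct opp { unsigned int freq_khz; unsigned int volt_uv; };

static const struct opp opp_table[] = {
    {  624000,  800000 },
    { 1001000,  850000 },
    { 1508000,  950000 },
    { 1989000, 1050000 },
};

/* Pick the lowest entry whose frequency is >= the request
 * (the same idea as CPUFREQ_RELATION_L). */
static const struct opp *select_opp(unsigned int target_khz)
{
    unsigned int n = sizeof(opp_table) / sizeof(opp_table[0]);

    for (unsigned int i = 0; i < n; i++)
        if (opp_table[i].freq_khz >= target_khz)
            return &opp_table[i];
    return &opp_table[n - 1];
}

int main(void)
{
    const struct opp *o = select_opp(1200000);

    printf("request 1200000 kHz -> program %u kHz @ %u uV\n",
           o->freq_khz, o->volt_uv);
    return 0;
}

A real driver additionally orders the two writes: when scaling up it raises the voltage before switching the PLL, and when scaling down it lowers the voltage only after the frequency has dropped, so the core is never clocked faster than the current voltage allows.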

The code is analyzed in detail below:

int __cpufreq_driver_target(struct cpufreq_policy *policy,unsigned int target_freq,unsigned int relation)
{unsigned int old_target_freq = target_freq;int retval = -EINVAL;if (cpufreq_disabled())return -ENODEV;/* (1) target目标频率在policy中的合法性检测 *//* Make sure that target_freq is within supported range */if (target_freq > policy->max)target_freq = policy->max;if (target_freq < policy->min)target_freq = policy->min;pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n",policy->cpu, target_freq, relation, old_target_freq);/* (2) 如果当前频率就是target频率,不用调整直接返回 *//** This might look like a redundant call as we are checking it again* after finding index. But it is left intentionally for cases where* exactly same freq is called again and so we can save on few function* calls.*/if (target_freq == policy->cur)return 0;/* Save last value to restore later on errors */policy->restore_freq = policy->cur;if (cpufreq_driver->target)/* (3) 调用实际的驱动target()函数来调整cpu频率 */retval = cpufreq_driver->target(policy, target_freq, relation);else if (cpufreq_driver->target_index) {struct cpufreq_frequency_table *freq_table;int index;freq_table = cpufreq_frequency_get_table(policy->cpu);if (unlikely(!freq_table)) {pr_err("%s: Unable to find freq_table\n", __func__);goto out;}retval = cpufreq_frequency_table_target(policy, freq_table,target_freq, relation, &index);if (unlikely(retval)) {pr_err("%s: Unable to find matching freq\n", __func__);goto out;}if (freq_table[index].frequency == policy->cur) {retval = 0;goto out;}retval = __target_index(policy, freq_table, index);}out:return retval;
}|→static int _mt_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq,unsigned int relation)
{struct mt_cpu_dvfs *p;int ret;unsigned int new_opp_idx;p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));if (!p)return -EINVAL;/* (3.1) 驱动根据频率电压表,配置target频率和对应电压 */ret = cpufreq_frequency_table_target(policy, p->freq_tbl_for_cpufreq,target_freq, relation, &new_opp_idx);if (ret || new_opp_idx >= p->nr_opp_tbl)return -EINVAL;if (dvfs_disable_flag || p->dvfs_disable_by_suspend || p->dvfs_disable_by_procfs)return -EPERM;_mt_cpufreq_dvfs_request_wrapper(p, new_opp_idx, MT_CPU_DVFS_NORMAL, NULL);return 0;
}

4.3.2、interactive governor

Among all cpufreq governors the best known is the interactive governor, because almost every Android system uses it.

The idea of interactive is to adjust the CPU frequency based on the CPU load. At its core, a 20ms timer samples the CPU usage, and the frequency is moved to a different step depending on which usage threshold is crossed.

The overall calculation performed by interactive is roughly as follows:

  • 1、Accumulate the cpu load. A sample is taken every 20ms, and each sample adds the product of the newly elapsed active_time and the current frequency: cputime_speedadj += active_time * cur_freq;
  • 2、Compute the cpu usage. current cpu usage = (accumulated load * 100) / (accumulated time * current frequency), i.e. cpu_load = (loadadjfreq*100)/(delta_time*cur_freq) (see the sketch after this list);
  • 3、If cpu_load reaches the high threshold go_hispeed_load (99%), or a boost is in effect, the frequency is raised directly to hispeed_freq (the highest frequency);
  • 4、In all other cases the new frequency is computed by the choose_freq() formula: new_freq = cur_freq*(cpu_load/DEFAULT_TARGET_LOAD(90)); new_freq = cpufreq_frequency_table_target(new_freq, CPUFREQ_RELATION_L);
  • 5、If the current frequency has already reached hispeed_freq and a further increase is wanted, the previous frequency must first have been held for above_hispeed_delay (20ms); if the current frequency has reached hispeed_freq and a decrease is wanted, the previous frequency must first have been held for min_sample_time (80ms);
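To make steps 1 and 2 above concrete, here is a small self-contained sketch of the same arithmetic the 20ms timer performs: accumulate active_time * cur_freq per sample, then express the resulting average frequency as a percentage of the current frequency. The sample values are invented; in the kernel the sums live in per-cpu state such as pcpu->cputime_speedadj.

#include <stdio.h>

/* One invented 20ms sample: how long the cpu was busy (us) and the
 * frequency (kHz) it ran at during that window. */
struct sample { unsigned long long active_us; unsigned int freq_khz; };

int main(void)
{
    const struct sample samples[] = {
        { 15000, 1001000 },    /* 15ms busy at 1001 MHz */
        {  5000, 1508000 },    /*  5ms busy at 1508 MHz */
        { 20000, 1508000 },    /* fully busy            */
    };
    const unsigned long long window_us = 20000;   /* timer_rate = 20ms  */
    const unsigned int cur_freq = 1508000;        /* current frequency  */
    unsigned long long cputime_speedadj = 0;      /* sum(active * freq) */
    unsigned long long delta_time = 0;            /* total elapsed time */

    for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        cputime_speedadj += samples[i].active_us * samples[i].freq_khz;
        delta_time += window_us;
    }

    /* cputime_speedadj / delta_time is an "average frequency"; scaling
     * it by 100 and dividing by the current frequency gives cpu_load. */
    unsigned long long loadadjfreq = (cputime_speedadj / delta_time) * 100;
    unsigned int cpu_load = (unsigned int)(loadadjfreq / cur_freq);

    printf("cpu_load = %u%%\n", cpu_load);   /* prints 58% for this data */
    return 0;
}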

Looking at how the interactive governor works in principle, it has the following problems:

  • 1、The 20ms sampling period is too long, so the reaction time from a load change to a frequency adjustment is too long;
  • 2、The load accumulation is flawed: historical load has no aging mechanism and is weighted the same as the present, so the current load change is not reflected truthfully;
  • 3、cpu usage is computed as total historical load / (total time * current frequency), which lets history influence the present far too much. If the cpu previously ran at a high frequency and now runs at a low one, the computed cpu_load can exceed 100%; if it previously ran at a low frequency and now runs at a high one, cpu_load is pulled far too low (see the sketch after this list);
  • 4、The choose_freq() formula has a serious flaw. For example, with a cpu frequency table = {800M, 900M}, cur_freq = 800M and cur_load = 100%, newfreq = (cur_freq*cur_load)/90 = 889M; selecting the step with CPUFREQ_RELATION_L still ends up at 800M, so the frequency cannot move up to the next step at all. This is a hole in the algorithm: whenever the gap between adjacent frequency steps is larger than the factor 100/90, normal up-scaling cannot get through and is blocked by the CPUFREQ_RELATION_L selection. In practice interactive therefore only reaches the top via go_hispeed_load (99%), and then uses choose_freq() to come back down.
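A minimal numeric sketch of problem 3 (all numbers invented): two fully busy 20ms windows with the frequency dropping from 1989 MHz to 624 MHz in between already push the computed cpu_load above 200%, because the high-frequency history is never aged out.

#include <stdio.h>

int main(void)
{
    /* Two invented 20ms windows, both 100% busy, but the frequency
     * dropped from 1989 MHz to 624 MHz between them. */
    unsigned long long cputime_speedadj =
        20000ULL * 1989000 +    /* window 1: busy at 1989 MHz */
        20000ULL * 624000;      /* window 2: busy at 624 MHz  */
    unsigned long long delta_time = 40000;   /* 2 x 20ms, in us   */
    unsigned int cur_freq = 624000;          /* the frequency now */

    unsigned long long loadadjfreq = (cputime_speedadj / delta_time) * 100;
    unsigned int cpu_load = (unsigned int)(loadadjfreq / cur_freq);

    /* The cpu was never more than 100% busy, yet cpu_load comes out at
     * about 209%; with the frequencies swapped the same history would
     * instead understate the load. */
    printf("cpu_load = %u%%\n", cpu_load);
    return 0;
}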

So the interactive governor is gradually being replaced by newer cpufreq governors (notably the scheduler-driven schedutil).

4.3.2.1、interactive governor initialization

  • 1、Part of the interactive initialization is done in cpufreq_interactive_init():
static int __init cpufreq_interactive_init(void)
{unsigned int i;struct cpufreq_interactive_cpuinfo *pcpu;struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };/* (1) 初始化percpu变量per_cpu(cpuinfo, i): 每个cpu创建负载计算定时器pcpu->cpu_timer其他的锁*//* Initalize per-cpu timers */for_each_possible_cpu(i) {pcpu = &per_cpu(cpuinfo, i);init_timer_deferrable(&pcpu->cpu_timer);pcpu->cpu_timer.function = cpufreq_interactive_timer;pcpu->cpu_timer.data = i;init_timer(&pcpu->cpu_slack_timer);pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;spin_lock_init(&pcpu->load_lock);spin_lock_init(&pcpu->target_freq_lock);init_rwsem(&pcpu->enable_sem);}spin_lock_init(&speedchange_cpumask_lock);mutex_init(&gov_lock);/* (2) 创建频率调整进程speedchange_task, 把耗时的频率调整工作单独放到一个进程中去做*/speedchange_task =kthread_create(cpufreq_interactive_speedchange_task, NULL,"cfinteractive");if (IS_ERR(speedchange_task))return PTR_ERR(speedchange_task);sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);get_task_struct(speedchange_task);/* NB: wake up so the thread does not look hung to the freezer */wake_up_process(speedchange_task);return cpufreq_register_governor(&cpufreq_gov_interactive);
}
  • 2、The other part of the interactive initialization happens in cpufreq_governor_interactive(), under the CPUFREQ_GOV_POLICY_INIT and CPUFREQ_GOV_START commands, executed when a cpu comes online:

static int cpufreq_governor_interactive(struct cpufreq_policy *policy,unsigned int event)
{switch (event) {/* (1)  CPUFREQ_GOV_POLICY_INIT命令初始化interactive governor最核心的参数*/case CPUFREQ_GOV_POLICY_INIT:if (have_governor_per_policy()) {WARN_ON(tunables);} else if (tunables) {tunables->usage_count++;policy->governor_data = tunables;return 0;}tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);if (!tunables) {pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);return -ENOMEM;}tunables->usage_count = 1;tunables->above_hispeed_delay = default_above_hispeed_delay;tunables->nabove_hispeed_delay =ARRAY_SIZE(default_above_hispeed_delay);tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;tunables->target_loads = default_target_loads;tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;tunables->timer_rate = DEFAULT_TIMER_RATE;          // interactive负载计算timer默认时间为20mstunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;tunables->timer_slack_val = DEFAULT_TIMER_SLACK;spin_lock_init(&tunables->target_loads_lock);spin_lock_init(&tunables->above_hispeed_delay_lock);policy->governor_data = tunables;if (!have_governor_per_policy()) {common_tunables = tunables;}rc = sysfs_create_group(get_governor_parent_kobj(policy),get_sysfs_attr());if (rc) {kfree(tunables);policy->governor_data = NULL;if (!have_governor_per_policy()) {common_tunables = NULL;}return rc;}if (!policy->governor->initialized) {idle_notifier_register(&cpufreq_interactive_idle_nb);cpufreq_register_notifier(&cpufreq_notifier_block,CPUFREQ_TRANSITION_NOTIFIER);}break;/* (2) CPUFREQ_GOV_START命令启动interactive负载计算的timer*/case CPUFREQ_GOV_START:mutex_lock(&gov_lock);freq_table = cpufreq_frequency_get_table(policy->cpu);if (tunables && !tunables->hispeed_freq)tunables->hispeed_freq = policy->max;for_each_cpu(j, policy->cpus) {pcpu = &per_cpu(cpuinfo, j);pcpu->policy = policy;pcpu->target_freq = policy->cur;pcpu->freq_table = freq_table;pcpu->floor_freq = pcpu->target_freq;pcpu->pol_floor_val_time =ktime_to_us(ktime_get());pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;down_write(&pcpu->enable_sem);del_timer_sync(&pcpu->cpu_timer);del_timer_sync(&pcpu->cpu_slack_timer);cpufreq_interactive_timer_start(tunables, j);pcpu->governor_enabled = 1;up_write(&pcpu->enable_sem);}mutex_unlock(&gov_lock);break;}

4.3.2.2、interactive governor algorithm

The core algorithm of the interactive governor runs in the 20ms periodic timer handler cpufreq_interactive_timer():

static void cpufreq_interactive_timer(unsigned long data)
{u64 now;unsigned int delta_time;u64 cputime_speedadj;int cpu_load;struct cpufreq_interactive_cpuinfo *pcpu =&per_cpu(cpuinfo, data);struct cpufreq_interactive_tunables *tunables =pcpu->policy->governor_data;unsigned int new_freq;unsigned int loadadjfreq;unsigned int index;unsigned long flags;u64 max_fvtime;int j;unsigned int max_t_freq = 0;#ifdef CPUDVFS_POWER_MODE/* default(normal), low power, just make, performance(sports) */int min_sample_t[4] = { 80, 20, 20, 80 };int ppb_idx;
#endifif (!down_read_trylock(&pcpu->enable_sem))return;if (!pcpu->governor_enabled)goto exit;spin_lock_irqsave(&pcpu->load_lock, flags);/* (1) 累加cpu上自从cpu_up()以来的负载,pcpu->cputime_speedadj += active_time * pcpu->policy->cur;pcpu->cputime_speedadj = (active_time * pcpu->policy->cur)samp1 + ... +(active_time * pcpu->policy->cur)sampn ;每个采样周期为20mS,累加:第1个20ms中active_time*cur_cpu_freq + 第2个20ms中active_time*cur_cpu_freq +...+ 第n个20ms中active_time*cur_cpu_freq*/now = update_load(data);/* (2) 自从cpu_up()以来的总的时间delta_time = active_time + ilde_time*/delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);cputime_speedadj = pcpu->cputime_speedadj;spin_unlock_irqrestore(&pcpu->load_lock, flags);if (WARN_ON_ONCE(!delta_time))goto rearm;spin_lock_irqsave(&pcpu->target_freq_lock, flags);/* (3) 总的负载/总时间 = 平均频率 */do_div(cputime_speedadj, delta_time);/* (4) (平均频率 * 100)/当前频率 = 当前cpu的占用率 */loadadjfreq = (unsigned int)cputime_speedadj * 100;cpu_load = loadadjfreq / pcpu->policy->cur;tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;#ifdef CPUDVFS_POWER_MODEppb_idx = mt_cpufreq_get_ppb_state();{unsigned int idx = mt_cpufreq_ppb_hispeed_freq(data, ppb_idx);tunables->hispeed_freq = pcpu->freq_table[idx].frequency;tunables->min_sample_time = min_sample_t[ppb_idx] * USEC_PER_MSEC;if (hispeed_freq_perf != 0)tunables->hispeed_freq = hispeed_freq_perf;if (min_sample_time_perf != 0)tunables->min_sample_time = min_sample_time_perf;}
#endif/* (5) 如果cpu占用率达到go_hispeed_load(99%),或者在boost状态,频率直接调整到最高频率hispeed_freq*/if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {if (pcpu->policy->cur < tunables->hispeed_freq) {new_freq = tunables->hispeed_freq;} else {new_freq = choose_freq(pcpu, loadadjfreq);if (new_freq < tunables->hispeed_freq)new_freq = tunables->hispeed_freq;}/* (6) 否则使用choose_freq()根据当前负载来计算对应的频率*/} else {new_freq = choose_freq(pcpu, loadadjfreq);if (new_freq > tunables->hispeed_freq &&pcpu->policy->cur < tunables->hispeed_freq)new_freq = tunables->hispeed_freq;}/* (7) 如果计算出的新频率 > hispeed_freq,不能马上调整,在hispeed_freq以上的频率上必须待满above_hispeed_delay(20ms),才能继续往上调整频率*/if (pcpu->policy->cur >= tunables->hispeed_freq &&new_freq > pcpu->policy->cur &&now - pcpu->pol_hispeed_val_time <freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) {trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq,pcpu->policy->cur, new_freq);spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm;}pcpu->loc_hispeed_val_time = now;if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,new_freq, CPUFREQ_RELATION_L,&index)) {spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm;}new_freq = pcpu->freq_table[index].frequency;/* (8) 如果之前的频率 > hispeed_freq,或者发生boost现在需要往低调频,之前的频率需要待满min_sample_time(80ms)*//** Do not scale below floor_freq unless we have been at or above the* floor frequency for the minimum sample time since last validated.*/max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time);if (new_freq < pcpu->floor_freq &&pcpu->target_freq >= pcpu->policy->cur) {if (now - max_fvtime < tunables->min_sample_time) {trace_cpufreq_interactive_notyet(data, cpu_load, pcpu->target_freq,pcpu->policy->cur, new_freq);spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm;}}/** Update the timestamp for checking whether speed has been held at* or above the selected frequency for a minimum of min_sample_time,* if not boosted to hispeed_freq.  If boosted to hispeed_freq then we* allow the speed to drop as soon as the boostpulse duration expires* (or the indefinite boost is turned off).*/if (!tunables->boosted || new_freq > tunables->hispeed_freq) {pcpu->floor_freq = new_freq;if (pcpu->target_freq >= pcpu->policy->cur ||new_freq >= pcpu->policy->cur)pcpu->loc_floor_val_time = now;}/* (9) 如果当前cpu往低调整频率,判断当前policy是否需要更新,因为多个cpu共享一个policy,取最大期望频率cpu的值作为整个policy的调整值*/if (pcpu->target_freq == new_freq &&pcpu->target_freq <= pcpu->policy->cur) {max_t_freq = 0;for_each_cpu(j, pcpu->policy->cpus) {struct cpufreq_interactive_cpuinfo *pjcpu;pjcpu = &per_cpu(cpuinfo, j);max_t_freq = max(max_t_freq, pjcpu->target_freq);}if (max_t_freq != pcpu->policy->cur)goto pass_t;trace_cpufreq_interactive_already(data, cpu_load, pcpu->target_freq,pcpu->policy->cur, new_freq);spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);goto rearm;}
pass_t:trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq,pcpu->policy->cur, new_freq);/* (10) 如果policy需要更新唤醒speedchange_task来执行调频动作 */pcpu->target_freq = new_freq;spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);spin_lock_irqsave(&speedchange_cpumask_lock, flags);cpumask_set_cpu(data, &speedchange_cpumask);spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);wake_up_process(speedchange_task);rearm:if (!timer_pending(&pcpu->cpu_timer))cpufreq_interactive_timer_resched(pcpu);exit:up_read(&pcpu->enable_sem);return;
}|→static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu,unsigned int loadadjfreq)
{unsigned int freq = pcpu->policy->cur;unsigned int prevfreq, freqmin, freqmax;unsigned int tl;int index;freqmin = 0;freqmax = UINT_MAX;do {prevfreq = freq;/* (6.1) tl = 90,loadadjfreq = (平均频率 * 100)即 newfreq =  (平均频率 * 100)/ 90相当于cpufreq_frequency_table_target(CPUFREQ_RELATION_L),相当于newfreq往低档位的计算,ooooo这里带来一个非常严重的问题,如果档位之间差值大于100/90,向上调频将调不上去*/tl = freq_to_targetload(pcpu->policy->governor_data, freq);/** Find the lowest frequency where the computed load is less* than or equal to the target load.*/if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table, loadadjfreq / tl,CPUFREQ_RELATION_L, &index))break;freq = pcpu->freq_table[index].frequency;if (freq > prevfreq) {/* The previous frequency is too low. */freqmin = prevfreq;if (freq >= freqmax) {/** Find the highest frequency that is less* than freqmax.*/if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,freqmax - 1, CPUFREQ_RELATION_H,&index))break;freq = pcpu->freq_table[index].frequency;if (freq == freqmin) {/** The first frequency below freqmax* has already been found to be too* low.  freqmax is the lowest speed* we found that is fast enough.*/freq = freqmax;break;}}} else if (freq < prevfreq) {/* The previous frequency is high enough. */freqmax = prevfreq;if (freq <= freqmin) {/** Find the lowest frequency that is higher* than freqmin.*/if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,freqmin + 1, CPUFREQ_RELATION_L,&index))break;freq = pcpu->freq_table[index].frequency;/** If freqmax is the first frequency above* freqmin then we have already found that* this speed is fast enough.*/if (freq == freqmax)break;}}/* If same frequency chosen as previous then done. */} while (freq != prevfreq);return freq;
}

4.4、cpu hotplug adjustment

Another way of adjusting to the load is cpu hotplug:

  • 1、A hot-unplugged cpu consumes less power than a cpu in idle; if every cpu in a cluster is offline, the whole cluster can be powered off; so hotplug saves power;
  • 2、But hotplug has costs: a hotplug operation can take on the order of milliseconds in the slow case, and migrating tasks is not free either; cpus also have to be plugged and unplugged in order, so unplugging the most loaded cpu first would be unreasonable;
  • 3、MTK's hardware constraints make hotplug mandatory: an MTK platform can only enter deep idle when a single cpu remains online, so the platform must support hotplug; Samsung and Qualcomm platforms can enter deep idle with several cpus online, so they generally do not use cpu hotplug;

4.4.1、hotplug low-level implementation

4.4.1.1、cpu_up()/cpu_down()

The kernel's hotplug support is mature: the standard interfaces cpu_up()/cpu_down() perform the hotplug.
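cpu_up()/cpu_down() are kernel-internal symbols; from user space the same hotplug path is normally exercised through sysfs, by writing 0 or 1 to /sys/devices/system/cpu/cpuN/online (root is required, and cpu0 is often not hotpluggable). A small sketch, with the cpu number chosen arbitrarily:

#include <stdio.h>

/* Write "0" or "1" to /sys/devices/system/cpu/cpuN/online.
 * Returns 0 on success, -1 on failure (not root, cpu not
 * hotpluggable, ...). */
static int set_cpu_online(int cpu, int online)
{
    char path[64];
    FILE *f;

    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/online", cpu);
    f = fopen(path, "w");
    if (!f) {
        perror(path);
        return -1;
    }
    fputs(online ? "1" : "0", f);
    fclose(f);
    return 0;
}

int main(void)
{
    /* cpu 3 is only an example */
    if (set_cpu_online(3, 0) == 0)
        puts("cpu3 offlined");
    if (set_cpu_online(3, 1) == 0)
        puts("cpu3 onlined");
    return 0;
}

On kernels of this generation that sysfs write ends up calling cpu_down()/cpu_up() inside the kernel, which is exactly the path that triggers the task migration described in the next subsection.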

4.4.1.2、task migration on hotplug

During cpu_down(), migration_call() -> migrate_tasks() is called to migrate every runnable task on the dying cpu to other cpus; during cpu_up() no migration is needed inside the function itself, tasks simply flow back via the load-balancing algorithm.

static void migrate_tasks(struct rq *dead_rq)
{struct rq *rq = dead_rq;struct task_struct *next, *stop = rq->stop;int dest_cpu;/** Fudge the rq selection such that the below task selection loop* doesn't get stuck on the currently eligible stop task.** We're currently inside stop_machine() and the rq is either stuck* in the stop_machine_cpu_stop() loop, or we're executing this code,* either way we should never end up calling schedule() until we're* done here.*/rq->stop = NULL;/** put_prev_task() and pick_next_task() sched* class method both need to have an up-to-date* value of rq->clock[_task]*/update_rq_clock(rq);unthrottle_offline_rt_rqs(rq);for (;;) {/** There's this thread running, bail when that's the only* remaining thread.*/if (rq->nr_running == 1)break;/* (1) 逐个从rq中获取task = next *//** pick_next_task assumes pinned rq->lock.*/lockdep_pin_lock(&rq->lock);next = pick_next_task(rq, &fake_task);BUG_ON(!next);next->sched_class->put_prev_task(rq, next);/** Rules for changing task_struct::cpus_allowed are holding* both pi_lock and rq->lock, such that holding either* stabilizes the mask.** Drop rq->lock is not quite as disastrous as it usually is* because !cpu_active at this point, which means load-balance* will not interfere. Also, stop-machine.*/lockdep_unpin_lock(&rq->lock);raw_spin_unlock(&rq->lock);raw_spin_lock(&next->pi_lock);raw_spin_lock(&rq->lock);/** Since we're inside stop-machine, _nothing_ should have* changed the task, WARN if weird stuff happened, because in* that case the above rq->lock drop is a fail too.*/if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {raw_spin_unlock(&next->pi_lock);continue;}/* (2) 找到最适合next进程迁移的目的cpu *//* Find suitable destination for @next, with force if needed. */dest_cpu = select_fallback_rq(dead_rq->cpu, next);/* (3) 实施进程迁移 */rq = __migrate_task(rq, next, dest_cpu);if (rq != dead_rq) {raw_spin_unlock(&rq->lock);rq = dead_rq;raw_spin_lock(&rq->lock);}raw_spin_unlock(&next->pi_lock);}rq->stop = stop;
}|→static int select_fallback_rq(int cpu, struct task_struct *p)
{int nid = cpu_to_node(cpu);const struct cpumask *nodemask = NULL;enum { cpuset, possible, fail } state = cpuset;int dest_cpu;/** If the node that the cpu is on has been offlined, cpu_to_node()* will return -1. There is no cpu on the node, and we should* select the cpu on the other node.*/if (nid != -1) {nodemask = cpumask_of_node(nid);/* Look for allowed, online CPU in same node. */for_each_cpu(dest_cpu, nodemask) {if (!cpu_online(dest_cpu))continue;if (!cpu_active(dest_cpu))continue;if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                return dest_cpu;}}for (;;) {/* (2.1) 最好的情况:在tsk_cpus_allowed(p)中能找到online cpu迁移 *//* Any allowed, online CPU? */for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {if (!cpu_online(dest_cpu))continue;if (!cpu_active(dest_cpu))continue;goto out;}/* No more Mr. Nice Guy. */switch (state) {/* (2.2) 其次的情况:在cpuset中能找到online cpu迁移 */case cpuset:if (IS_ENABLED(CONFIG_CPUSETS)) {cpuset_cpus_allowed_fallback(p);state = possible;break;}/* (2.3) 最差的情况:在系统所有cpu中能找到online cpu迁移 *//* fall-through */case possible:do_set_cpus_allowed(p, cpu_possible_mask);state = fail;break;case fail:BUG();break;}}out:if (state != cpuset) {/** Don't tell them about moving exiting tasks or* kernel threads (both mm NULL), since they never* leave kernel.*/if (p->mm && printk_ratelimit()) {printk_deferred("process %d (%s) no longer affine to cpu%d\n",task_pid_nr(p), p->comm, cpu);}}
return dest_cpu;
}

4.4.2、MTK hotplug algorithm

With the low-level hotplug implementation cpu_up()/cpu_down() in place, a further algorithm is needed on top of it to hotplug cpus dynamically according to cpu load. MTK's algorithm here is fairly complete and is split into two main parts: HICA and hps_algo_main.

4.4.2.1、HICA/PPM

The relationship between HICA and hps is that HICA decides a coarse mode, and hps then performs the fine-grained adjustment within that mode.

For example, HICA on MT6799 supports 3 modes:

  • 1、LL_ONLY // only the little (LL) cores are used
  • 2、L_ONLY // only the middle (L) cores are used
  • 3、ALL // all of the LL, L and B cores (10 cores in total) may be used

HICA computes the load in mt_ppm_hica_update_algo_data() and decides the mode according to how the load changes:

_hps_task_main() -> mt_ppm_hica_update_algo_data()↓void mt_ppm_hica_update_algo_data(unsigned int cur_loads,unsigned int cur_nr_heavy_task, unsigned int cur_tlp)
{struct ppm_power_state_data *state_info = ppm_get_power_state_info();struct ppm_state_transfer_data *data;enum ppm_power_state cur_state;enum ppm_mode cur_mode;int i, j;FUNC_ENTER(FUNC_LV_HICA);ppm_lock(&hica_policy.lock);ppm_hica_algo_data.ppm_cur_loads = cur_loads;ppm_hica_algo_data.ppm_cur_tlp = cur_tlp;ppm_hica_algo_data.ppm_cur_nr_heavy_task = cur_nr_heavy_task;cur_state = ppm_hica_algo_data.cur_state;cur_mode = ppm_main_info.cur_mode;ppm_dbg(HICA, "cur_loads = %d, cur_tlp = %d, cur_nr_heavy_task = %d, cur_state = %s, cur_mode = %d\n",cur_loads, cur_tlp, cur_nr_heavy_task, ppm_get_power_state_name(cur_state), cur_mode);if (!ppm_main_info.is_enabled || !hica_policy.is_enabled || ppm_main_info.is_in_suspend ||cur_state == PPM_POWER_STATE_NONE)goto end;#if defined(CONFIG_MACH_MT6757) || defined(CONFIG_MACH_KIBOPLUS)if (setup_max_cpus == 4)goto end;
#endif#ifdef PPM_IC_SEGMENT_CHECKif (ppm_main_info.fix_state_by_segment != PPM_POWER_STATE_NONE)goto end;
#endif/* skip HICA if DVFS is not ready (we cannot get current freq...) */if (!ppm_main_info.client_info[PPM_CLIENT_DVFS].limit_cb)goto end;/* Power state is fixed by user, skip HICA state calculation */if (fix_power_state != PPM_POWER_STATE_NONE)goto end;/* (1) 从transfer_by_perf到transfer_by_pwr逐个遍历判断当前state是否需要改变 */for (i = 0; i < 2; i++) {data = (i == 0) ? state_info[cur_state].transfer_by_perf: state_info[cur_state].transfer_by_pwr;/* (2) 如果当前state有几种变化逐个遍历,比如:当前state为ALL,可以ALL -> LL_ONLY也可以ALL -> L_ONLY*/for (j = 0; j < data->size; j++) {if (!data->transition_data[j].transition_rule|| !((1 << cur_mode) & data->transition_data[j].mode_mask))continue;/* (3) 如果state变化,获取新的state返回 */if (data->transition_data[j].transition_rule(ppm_hica_algo_data, &data->transition_data[j])) {ppm_hica_algo_data.new_state = data->transition_data[j].next_state;ppm_dbg(HICA, "[%s(%d)] Need state transfer: %s --> %s\n",(i == 0) ? "PERF" : "PWR",j,ppm_get_power_state_name(cur_state),ppm_get_power_state_name(ppm_hica_algo_data.new_state));goto end;/* (4) 如果state不变化,维持当前state,继续遍历*/} else {ppm_hica_algo_data.new_state = cur_state;
#ifdef PPM_HICA_2P0ppm_dbg(HICA, "[%s(%d)]hold in %s state, capacity_hold_cnt = %d, bigtsk_hold_cnt = %d, freq_hold_cnt = %d\n",(i == 0) ? "PERF" : "PWR",j,ppm_get_power_state_name(cur_state),data->transition_data[j].capacity_hold_cnt,data->transition_data[j].bigtsk_hold_cnt,data->transition_data[j].freq_hold_cnt);
#else
#if PPM_HICA_VARIANT_SUPPORTppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d, overutil_l_hold_cnt = %d, .overutil_h_hold_cnt = %d\n",(i == 0) ? "PERF" : "PWR",j,ppm_get_power_state_name(cur_state),data->transition_data[j].loading_hold_cnt,data->transition_data[j].freq_hold_cnt,data->transition_data[j].overutil_l_hold_cnt,data->transition_data[j].overutil_h_hold_cnt);
#elseppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d\n",(i == 0) ? "PERF" : "PWR",j,ppm_get_power_state_name(cur_state),data->transition_data[j].loading_hold_cnt,data->transition_data[j].freq_hold_cnt);
#endif
#endif}}}end:ppm_unlock(&hica_policy.lock);FUNC_EXIT(FUNC_LV_HICA);
}

The functions and thresholds used to compute the state are defined in tables; apart from heavy_task and big_task they basically evaluate the cpu usage as util/capacity:

struct ppm_power_state_data pwr_state_info_SB[NR_PPM_POWER_STATE] = {[0] = {.name = __stringify(LL_ONLY),.state = PPM_POWER_STATE_LL_ONLY,PWR_STATE_INFO(LL_ONLY, SB)},[1] = {.name = __stringify(L_ONLY),.state = PPM_POWER_STATE_L_ONLY,PWR_STATE_INFO(L_ONLY, SB)},[2] = {.name = __stringify(ALL),.state = PPM_POWER_STATE_ALL,PWR_STATE_INFO(ALL, SB)},
};static struct ppm_state_transfer state_pwr_transfer_ALL[] = {TRANS_DATA(LL_ONLY,PPM_MODE_MASK_ALL_MODE,ppm_trans_rule_ALL_to_LL_ONLY,PPM_DEFAULT_HOLD_TIME,PPM_CAPACITY_DOWN,PPM_DEFAULT_BIGTSK_TIME,0,0,0),TRANS_DATA(L_ONLY,PPM_MODE_MASK_ALL_MODE,ppm_trans_rule_ALL_to_L_ONLY,PPM_DEFAULT_HOLD_TIME,PPM_CAPACITY_DOWN,PPM_DEFAULT_BIGTSK_TIME,2,4,0),
};
STATE_TRANSFER_DATA_PWR(ALL);static struct ppm_state_transfer state_perf_transfer_ALL[] = {TRANS_DATA(NONE, 0, NULL, 0, 0, 0, 0, 0, 0),
};
STATE_TRANSFER_DATA_PERF(ALL);/* 举例:当前state为ALL尝试从power的角度从ALL切换到LL_ONLY:ppm_trans_rule_ALL_to_LL_ONLY()尝试从power的角度从ALL切换到L_ONLY:ppm_trans_rule_ALL_to_L_ONLY()*/
static bool ppm_trans_rule_ALL_to_LL_ONLY(struct ppm_hica_algo_data data, struct ppm_state_transfer *settings)
{/* keep in ALL state if root cluster is fixed at L or B */if (ppm_main_info.fixed_root_cluster == PPM_CLUSTER_L|| ppm_main_info.fixed_root_cluster == PPM_CLUSTER_B)return false;/* (1) 从heavy task负载判断是否需要切换模式 */
#if PPM_HEAVY_TASK_INDICATE_SUPPORT{unsigned int heavy_task, i;for_each_ppm_clusters(i) {heavy_task = hps_get_hvytsk(i);if (heavy_task) {ppm_dbg(HICA, "Stay in ALL due to cluster%d heavy task = %d\n",i, heavy_task);trace_ppm_hica(ppm_get_power_state_name(PPM_POWER_STATE_ALL),ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),-1, -1, -1, -1, heavy_task, -1, false);settings->capacity_hold_cnt = 0;return false;}}}
#endif/* (2) 从big task负载判断是否需要切换模式 */
#if PPM_BIG_TASK_INDICATE_SUPPORT{unsigned int big_task_L = hps_get_bigtsk(PPM_CLUSTER_L);unsigned int big_task_B = hps_get_bigtsk(PPM_CLUSTER_B);if (big_task_L || big_task_B) {ppm_dbg(HICA, "Stay in ALL due to L/B big task = %d/%d\n",big_task_L, big_task_B);trace_ppm_hica(ppm_get_power_state_name(PPM_POWER_STATE_ALL),ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),-1, -1, big_task_L, big_task_B, -1, -1, false);settings->capacity_hold_cnt = 0;return false;}}
#endif/* (3) 从util/capacity负载判断是否需要切换模式 */{/* check capacity */unsigned long usage, usage_total = 0, capacity = 0, dummy;unsigned int i;for_each_ppm_clusters(i) {if (sched_get_cluster_util(i, &usage, &dummy)) {ppm_err("Get cluster %d util failed\n", i);return false;}usage_total += usage;if (i == PPM_CLUSTER_LL)capacity = dummy;}ppm_dbg(HICA, "usage_total = %ld, LL capacity = %ld\n", usage_total, capacity);/* (3.1) (util/capacity)超过门限值(settings->capacity_bond) 是否达到次数settings->capacity_hold_time,如果条件满足进行state切换*/if (usage_total < capacity * settings->capacity_bond / 100) {settings->capacity_hold_cnt++;if (settings->capacity_hold_cnt >= settings->capacity_hold_time) {trace_ppm_hica(ppm_get_power_state_name(PPM_POWER_STATE_ALL),ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),usage_total, capacity, -1, -1, -1, -1, true);return true;}} elsesettings->capacity_hold_cnt = 0;trace_ppm_hica(ppm_get_power_state_name(PPM_POWER_STATE_ALL),ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),usage_total, capacity, -1, -1, -1, -1, false);}return false;
}
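The transition rules shown above reduce to threshold-plus-hold-count hysteresis: a switch such as ALL -> LL_ONLY is only taken once the total utilization has stayed below capacity_bond% of the LL cluster's capacity for capacity_hold_time consecutive evaluations, and any sample above the threshold resets the counter. Below is a simplified standalone sketch of that pattern; the numbers are illustrative, not the MTK defaults.

#include <stdio.h>
#include <stdbool.h>

/* Simplified ALL -> LL_ONLY rule: utilization must stay below
 * capacity_bond% of the LL capacity for capacity_hold_time
 * consecutive samples before the switch is allowed. */
struct transition {
    unsigned int capacity_bond;       /* e.g. 90 (%)           */
    unsigned int capacity_hold_time;  /* e.g. 4 samples        */
    unsigned int capacity_hold_cnt;   /* internal hold counter */
};

static bool all_to_ll_only(struct transition *t,
                           unsigned long usage_total,
                           unsigned long ll_capacity)
{
    if (usage_total < ll_capacity * t->capacity_bond / 100) {
        if (++t->capacity_hold_cnt >= t->capacity_hold_time)
            return true;              /* switch state           */
    } else {
        t->capacity_hold_cnt = 0;     /* reset the hysteresis   */
    }
    return false;                     /* hold the current state */
}

int main(void)
{
    struct transition t = { .capacity_bond = 90, .capacity_hold_time = 4 };
    unsigned long samples[] = { 400, 120, 110, 100, 90, 95 };
    unsigned long ll_capacity = 200;  /* invented LL capacity */

    for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("sample %u: usage=%lu -> %s\n", i, samples[i],
               all_to_ll_only(&t, samples[i], ll_capacity) ?
               "switch to LL_ONLY" : "stay in ALL");
    return 0;
}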

After the new state has been computed, it is pushed down through the following path:

_hps_task_main() -> mt_ppm_main() -> ppm_hica_update_limit_cb() -> ppm_hica_set_default_limit_by_state()↓void ppm_hica_set_default_limit_by_state(enum ppm_power_state state,struct ppm_policy_data *policy)
{unsigned int i;struct ppm_power_state_data *state_info = ppm_get_power_state_info();FUNC_ENTER(FUNC_LV_HICA);for (i = 0; i < policy->req.cluster_num; i++) {if (state >= PPM_POWER_STATE_NONE) {if (state > NR_PPM_POWER_STATE)ppm_err("@%s: Invalid PPM state(%d)\n", __func__, state);policy->req.limit[i].min_cpu_core = get_cluster_min_cpu_core(i);policy->req.limit[i].max_cpu_core = get_cluster_max_cpu_core(i);policy->req.limit[i].min_cpufreq_idx = get_cluster_min_cpufreq_idx(i);policy->req.limit[i].max_cpufreq_idx = get_cluster_max_cpufreq_idx(i);#ifdef PPM_DISABLE_CLUSTER_MIGRATION/* keep at least 1 LL */if (i == 0)policy->req.limit[i].min_cpu_core = 1;
#endif/* (1) HICA根据新的state,配置对应的min_cpu_core/max_cpu_core到本policy当中 */} else {policy->req.limit[i].min_cpu_core =state_info[state].cluster_limit->state_limit[i].min_cpu_core;policy->req.limit[i].max_cpu_core =state_info[state].cluster_limit->state_limit[i].max_cpu_core;policy->req.limit[i].min_cpufreq_idx =state_info[state].cluster_limit->state_limit[i].min_cpufreq_idx;policy->req.limit[i].max_cpufreq_idx =state_info[state].cluster_limit->state_limit[i].max_cpufreq_idx;}}#ifdef PPM_IC_SEGMENT_CHECK/* ignore HICA min freq setting for L cluster in L_ONLY state */if (state == PPM_POWER_STATE_L_ONLY && ppm_main_info.fix_state_by_segment == PPM_POWER_STATE_L_ONLY)policy->req.limit[1].min_cpufreq_idx = get_cluster_min_cpufreq_idx(1);
#endifFUNC_EXIT(FUNC_LV_HICA);
}/*==============================================================*/
/* Local Variables                      */
/*==============================================================*/
/* cluster limit for each power state */
static const struct ppm_cluster_limit state_limit_LL_ONLY[] = {[0] = LIMIT(15, 0, 1, 4),[1] = LIMIT(15, 0, 0, 0),[2] = LIMIT(15, 0, 0, 0),
};
STATE_LIMIT(LL_ONLY);static const struct ppm_cluster_limit state_limit_L_ONLY[] = {[0] = LIMIT(15, 0, 0, 0),[1] = LIMIT(8, 0, 1, 4),[2] = LIMIT(15, 0, 0, 0),
};
STATE_LIMIT(L_ONLY);static const struct ppm_cluster_limit state_limit_ALL[] = {[0] = LIMIT(15, 0, 0, 4),[1] = LIMIT(15, 0, 0, 4),[2] = LIMIT(15, 0, 0, 2),
};
STATE_LIMIT(ALL);_hps_task_main() -> mt_ppm_main() -> ppm_limit_callback()↓static void ppm_limit_callback(struct ppm_client_req req)
{struct ppm_client_req *p = (struct ppm_client_req *)&req;int i;/* (2) 将HICA state对应的policy配置到hps限制中hps_sys.cluster_info[i].ref_base_value/ref_limit_value */mutex_lock(&hps_ctxt.para_lock);hps_sys.ppm_root_cluster = p->root_cluster;for (i = 0; i < p->cluster_num; i++) {/** hps_warn("ppm_limit_callback -> cluster%d: has_advise_core = %d, [%d, %d]\n",*  i, p->cpu_limit[i].has_advise_core,*  p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);*/
#ifdef _TRACE_trace_ppm_limit_callback_update(i, p->cpu_limit[i].has_advise_core,p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);
#endifif (!p->cpu_limit[i].has_advise_core) {hps_sys.cluster_info[i].ref_base_value = p->cpu_limit[i].min_cpu_core;hps_sys.cluster_info[i].ref_limit_value = p->cpu_limit[i].max_cpu_core;} else {hps_sys.cluster_info[i].ref_base_value =hps_sys.cluster_info[i].ref_limit_value =p->cpu_limit[i].advise_cpu_core;}}mutex_unlock(&hps_ctxt.para_lock);hps_ctxt.is_interrupt = 1;hps_task_wakeup_nolock();}

4.4.2.2、hps_algo_main

_hps_task_main() -> hps_algo_main()↓void hps_algo_main(void)
{unsigned int i, val, base_val, action_print, origin_root, action_break;char str_online[64], str_ref_limit[64], str_ref_base[64], str_criteria_limit[64],str_criteria_base[64], str_target[64], str_hvytsk[64], str_pwrseq[64], str_bigtsk[64];char *online_ptr = str_online;char *criteria_limit_ptr = str_criteria_limit;char *criteria_base_ptr = str_criteria_base;char *ref_limit_ptr = str_ref_limit;char *ref_base_ptr = str_ref_base;char *hvytsk_ptr = str_hvytsk;char *target_ptr = str_target;char *pwrseq_ptr = str_pwrseq;char *bigtsk_ptr = str_bigtsk;static unsigned int hrtbt_dbg;
#ifdef CONFIG_MEIZU_BSPstatic unsigned long int j;
#endif //CONFIG_MEIZU_BSP
#ifdef CONFIG_MTK_ICCS_SUPPORTunsigned char real_online_power_state_bitmask = 0;unsigned char real_target_power_state_bitmask = 0;unsigned char iccs_online_power_state_bitmask = 0;unsigned char iccs_target_power_state_bitmask = iccs_get_target_power_state_bitmask();unsigned char target_cache_shared_state_bitmask = 0;
#endif/* Initial value */base_val = action_print = action_break = hps_sys.total_online_cores = 0;hps_sys.up_load_avg = hps_sys.down_load_avg = hps_sys.tlp_avg = hps_sys.rush_cnt = 0;hps_sys.action_id = origin_root = 0;/** run algo or not by hps_ctxt.enabled*/if ((u64) ktime_to_ms(ktime_sub(ktime_get(), hps_ctxt.hps_hrt_ktime)) >= HPS_HRT_DBG_MS)action_print = hrtbt_dbg = 1;elsehrtbt_dbg = 0;mutex_lock(&hps_ctxt.lock);hps_ctxt.action = ACTION_NONE;atomic_set(&hps_ctxt.is_ondemand, 0);if (!hps_ctxt.enabled)goto HPS_END;if (hps_ctxt.eas_indicator) {/*Set cpu cores by scheduler*/goto HPS_ALGO_END;}/** algo - begin*//*Back up limit and base value for check */mutex_lock(&hps_ctxt.para_lock);if ((hps_sys.cluster_info[0].base_value == 0) &&(hps_sys.cluster_info[1].base_value == 0) &&(hps_sys.cluster_info[2].base_value == 0) &&(hps_sys.cluster_info[0].limit_value == 0) &&(hps_sys.cluster_info[1].limit_value == 0) &&(hps_sys.cluster_info[2].limit_value == 0)) {hps_sys.cluster_info[0].base_value = hps_sys.cluster_info[0].ref_base_value = 0;hps_sys.cluster_info[1].base_value = hps_sys.cluster_info[1].ref_base_value = 0;hps_sys.cluster_info[2].base_value = hps_sys.cluster_info[2].ref_base_value = 0;hps_sys.cluster_info[0].limit_value = hps_sys.cluster_info[0].ref_limit_value = 4;hps_sys.cluster_info[1].limit_value = hps_sys.cluster_info[1].ref_limit_value = 4;hps_sys.cluster_info[2].limit_value = hps_sys.cluster_info[2].ref_limit_value = 0;}for (i = 0; i < hps_sys.cluster_num; i++) {hps_sys.cluster_info[i].base_value = hps_sys.cluster_info[i].ref_base_value;hps_sys.cluster_info[i].limit_value = hps_sys.cluster_info[i].ref_limit_value;}for (i = 0; i < hps_sys.cluster_num; i++) {base_val += hps_sys.cluster_info[i].base_value;hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num =0;hps_sys.cluster_info[i].online_core_num =hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id);hps_sys.total_online_cores += hps_sys.cluster_info[i].online_core_num;}mutex_unlock(&hps_ctxt.para_lock);/* Determine root cluster */origin_root = hps_sys.root_cluster_id;hps_define_root_cluster(&hps_sys);
#ifdef CONFIG_MACH_MT6799if (hps_ctxt.smart_det_enabled) {mutex_lock(&hps_ctxt.para_lock);hps_sys.root_cluster_id = 1;/*Change root to L cluster when smart detection is enabled*/mutex_unlock(&hps_ctxt.para_lock);}
#endifif (origin_root != hps_sys.root_cluster_id)hps_sys.action_id = HPS_SYS_CHANGE_ROOT;/** update history - tlp*/val = hps_ctxt.tlp_history[hps_ctxt.tlp_history_index];hps_ctxt.tlp_history[hps_ctxt.tlp_history_index] = hps_ctxt.cur_tlp;hps_ctxt.tlp_sum += hps_ctxt.cur_tlp;hps_ctxt.tlp_history_index =(hps_ctxt.tlp_history_index + 1 ==hps_ctxt.tlp_times) ? 0 : hps_ctxt.tlp_history_index + 1;++hps_ctxt.tlp_count;if (hps_ctxt.tlp_count > hps_ctxt.tlp_times) {WARN_ON(hps_ctxt.tlp_sum < val);hps_ctxt.tlp_sum -= val;hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_times;} else {hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_count;}if (hps_ctxt.stats_dump_enabled)hps_ctxt_print_algo_stats_tlp(0);/*Determine eas enabled or not*/if (!hps_ctxt.eas_enabled)hps_sys.hps_sys_ops[2].enabled = 0;for (i = 0 ; i < hps_sys.cluster_num ; i++)hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num;/* (1) 逐个调用 hps_sys_ops()根据各种算法来判断当前cpu是否需要hotplug */for (i = 0; i < hps_sys.func_num; i++) {if (hps_sys.hps_sys_ops[i].enabled == 1) {if (hps_sys.hps_sys_ops[i].hps_sys_func_ptr()) {hps_sys.action_id = hps_sys.hps_sys_ops[i].func_id;break;}}}
/*if (hps_ctxt.heavy_task_enabled)if (hps_algo_heavytsk_det())hps_sys.action_id = 0xE1;
*/if (hps_ctxt.big_task_enabled)if (hps_algo_big_task_det())hps_sys.action_id = 0xE2;if (hps_sys.action_id == 0)goto HPS_END;HPS_ALGO_END:#ifdef CONFIG_MACH_MT6799if (hps_ctxt.smart_det_enabled) {if (hps_sys.cluster_info[2].bigTsk_value <= 1) {mutex_lock(&hps_ctxt.para_lock);hps_sys.cluster_info[2].target_core_num = 1;mutex_unlock(&hps_ctxt.para_lock);}}
#endif/** algo - end*//* (2) 对limit进行判断,HICA的值就配置到这里 *//*Base and limit check */hps_check_base_limit(&hps_sys);/* Ensure that root cluster must one online cpu at less */if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num <= 0)hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num = 1;#ifdef CONFIG_MTK_ICCS_SUPPORTreal_online_power_state_bitmask = 0;real_target_power_state_bitmask = 0;for (i = 0; i < hps_sys.cluster_num; i++) {real_online_power_state_bitmask |= ((hps_sys.cluster_info[i].online_core_num > 0) << i);real_target_power_state_bitmask |= ((hps_sys.cluster_info[i].target_core_num > 0) << i);}iccs_online_power_state_bitmask = iccs_target_power_state_bitmask;iccs_target_power_state_bitmask = real_target_power_state_bitmask;iccs_get_target_state(&iccs_target_power_state_bitmask, &target_cache_shared_state_bitmask);/** pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask);*/for (i = 0; i < hps_sys.cluster_num; i++) {hps_sys.cluster_info[i].iccs_state = (((real_online_power_state_bitmask >> i) & 1) << 3) |(((real_target_power_state_bitmask >> i) & 1) << 2) |(((iccs_online_power_state_bitmask >> i) & 1) << 1) |(((iccs_target_power_state_bitmask >> i) & 1) << 0);/** pr_err("[%s] cluster: 0x%x iccs_state: 0x%x\n", __func__, i, hps_sys.cluster_info[i].iccs_state);*/if (hps_get_iccs_pwr_status(i) == 0x1)iccs_cluster_on_off(i, 1);else if (hps_get_iccs_pwr_status(i) == 0x2)iccs_cluster_on_off(i, 0);}
#endif/* (3) 经过各种算法计算后目标值是target_core_num,而当前值是online_core_num;如果不一致,进行cpu_up()/cpu_down()操作*/
#if 1               /*Make sure that priority of power on action is higher than power down. */for (i = 0; i < hps_sys.cluster_num; i++) {if (hps_sys.cluster_info[i].target_core_num >hps_sys.cluster_info[i].online_core_num) {if (hps_algo_do_cluster_action(i) == 1) {action_print = action_break = 1;break;}action_print = 1;}}if (!action_break) {for (i = 0; i < hps_sys.cluster_num; i++) {if (hps_sys.cluster_info[i].target_core_num <hps_sys.cluster_info[i].online_core_num) {if (hps_algo_do_cluster_action(i) == 1) {action_print = action_break = 1;break;}action_print = 1;}}}
#else/*Process root cluster first */if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num !=hps_sys.cluster_info[hps_sys.root_cluster_id].online_core_num) {if (hps_algo_do_cluster_action(hps_sys.root_cluster_id) == 1)action_break = 1;elseaction_break = 0;action_print = 1;}for (i = 0; i < hps_sys.cluster_num; i++) {if (i == hps_sys.root_cluster_id)continue;if (hps_sys.cluster_info[i].target_core_num !=hps_sys.cluster_info[i].online_core_num) {if (hps_algo_do_cluster_action(i) == 1)action_break = 1;elseaction_break = 0;action_print = 1;}}#endif
#ifdef CONFIG_MTK_ICCS_SUPPORTfor (i = 0; i < hps_sys.cluster_num; i++) {if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) !=hps_sys.cluster_info[i].target_core_num) {if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) == 0)iccs_target_power_state_bitmask &= ~(1 << i);else if (hps_sys.cluster_info[i].target_core_num == 0)iccs_target_power_state_bitmask |= (1 << i);}}/** pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask);*/iccs_set_target_power_state_bitmask(iccs_target_power_state_bitmask);
#endif
HPS_END:if (action_print || hrtbt_dbg) {int online, target, ref_limit, ref_base, criteria_limit, criteria_base, hvytsk, pwrseq, bigtsk;mutex_lock(&hps_ctxt.para_lock);online = target = criteria_limit = criteria_base = 0;for (i = 0; i < hps_sys.cluster_num; i++) {if (i == origin_root)online =sprintf(online_ptr, "<%d>",hps_sys.cluster_info[i].online_core_num);elseonline =sprintf(online_ptr, "(%d)",hps_sys.cluster_info[i].online_core_num);if (i == hps_sys.root_cluster_id)target =sprintf(target_ptr, "<%d>",hps_sys.cluster_info[i].target_core_num);elsetarget =sprintf(target_ptr, "(%d)",hps_sys.cluster_info[i].target_core_num);criteria_limit =sprintf(criteria_limit_ptr, "(%d)",hps_sys.cluster_info[i].limit_value);criteria_base =sprintf(criteria_base_ptr, "(%d)", hps_sys.cluster_info[i].base_value);ref_limit =sprintf(ref_limit_ptr, "(%d)", hps_sys.cluster_info[i].ref_limit_value);ref_base =sprintf(ref_base_ptr, "(%d)", hps_sys.cluster_info[i].ref_base_value);hvytsk = sprintf(hvytsk_ptr, "(%d)", hps_sys.cluster_info[i].hvyTsk_value);bigtsk = sprintf(bigtsk_ptr, "(%d)", hps_sys.cluster_info[i].bigTsk_value);if (i == 0)pwrseq = sprintf(pwrseq_ptr, "(%d->", hps_sys.cluster_info[i].pwr_seq);else if ((i != 0) && (i != (hps_sys.cluster_num - 1)))pwrseq = sprintf(pwrseq_ptr, "%d->", hps_sys.cluster_info[i].pwr_seq);else if (i == (hps_sys.cluster_num - 1))pwrseq = sprintf(pwrseq_ptr, "%d) ", hps_sys.cluster_info[i].pwr_seq);online_ptr += online;target_ptr += target;criteria_limit_ptr += criteria_limit;criteria_base_ptr += criteria_base;ref_limit_ptr += ref_limit;ref_base_ptr += ref_base;hvytsk_ptr += hvytsk;bigtsk_ptr += bigtsk;pwrseq_ptr += pwrseq;}mutex_unlock(&hps_ctxt.para_lock);if (action_print) {hps_set_funct_ctrl();if (action_break)hps_warn("(0x%X)%s action break!! (%u)(%u)(%u) %s %s%s-->%s%s (%u)(%u)(%u)(%u) %s\n",((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),str_online, hps_ctxt.cur_loads,hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk,str_criteria_limit, str_criteria_base,str_ref_limit, str_ref_base,hps_sys.up_load_avg,hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt,str_target);else {char str1[256];char str2[256];snprintf(str1, sizeof(str1),"(0x%X)%s action end (%u)(%u)(%u) %s %s[%u][%u](%u) %s %s%s (%u)(%u)(%u)(%u)",((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),str_online, hps_ctxt.cur_loads,hps_ctxt.cur_tlp, hps_ctxt.cur_iowait,str_hvytsk, str_bigtsk, hps_ctxt.is_screen_off,hps_ctxt.is_idle, hps_ctxt.idle_ratio,str_pwrseq, str_criteria_limit, str_criteria_base,hps_sys.up_load_avg,hps_sys.down_load_avg,hps_sys.tlp_avg, hps_sys.rush_cnt);snprintf(str2, sizeof(str2),"[%u,%u|%u,%u|%u,%u][%u,%u,%u] [%u,%u,%u] [%u,%u,%u] [%u,%u,%u] %s",hps_sys.cluster_info[0].up_threshold,hps_sys.cluster_info[0].down_threshold,hps_sys.cluster_info[1].up_threshold,hps_sys.cluster_info[1].down_threshold,hps_sys.cluster_info[2].up_threshold,hps_sys.cluster_info[2].down_threshold,hps_sys.cluster_info[0].loading,hps_sys.cluster_info[1].loading,hps_sys.cluster_info[2].loading,hps_sys.cluster_info[0].rel_load,hps_sys.cluster_info[1].rel_load,hps_sys.cluster_info[2].rel_load,hps_sys.cluster_info[0].abs_load,hps_sys.cluster_info[1].abs_load,hps_sys.cluster_info[2].abs_load,/* sched-assist hotplug: for debug */hps_sys.cluster_info[0].sched_load,hps_sys.cluster_info[1].sched_load,hps_sys.cluster_info[2].sched_load,str_target);
#ifdef CONFIG_MEIZU_BSPif (printk_timed_ratelimit(&j, 500))hps_warn("%s%s\n", str1, str2);
#elsehps_warn("%s%s\n", str1, str2);
#endif //CONFIG_MEIZU_BSP
#ifdef _TRACE_trace_hps_update(hps_sys.action_id, str_online, hps_ctxt.cur_loads,hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk,str_criteria_limit, str_criteria_base,hps_sys.up_load_avg, hps_sys.down_load_avg,hps_sys.tlp_avg,hps_sys.rush_hps_sys.cluster_info[0].up_threshold,hps_sys.cluster_info[0].down_threshold,hps_sys.cluster_info[0].up_threshold,hps_sys.cluster_info[0].down_threshold,hps_sys.cluster_info[2].up_threshold,hps_sys.cluster_info[2].down_threshold,hps_sys.cluster_info[0].loading, hps_sys.cluster_info[1].loading,hps_sys.cluster_info[2].loading,hps_ctxt.up_times, hps_ctxt.down_times, str_target);
#endif}hps_ctxt_reset_stas_nolock();}}
#if HPS_HRT_BT_ENif (hrtbt_dbg && (action_print)) {hps_set_funct_ctrl();hps_warn("(0x%X)%s HRT_BT_DBG (%u)(%u)(%u) %s %s %s %s%s (%u)(%u)(%u)(%u) %s\n",((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp,hps_ctxt.cur_iowait, str_hvytsk, str_bigtsk, str_pwrseq, str_criteria_limit,str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg,hps_sys.tlp_avg, hps_sys.rush_cnt, str_target);hrtbt_dbg = 0;hps_ctxt.hps_hrt_ktime = ktime_get();}
#endifaction_print = 0;action_break = 0;mutex_unlock(&hps_ctxt.lock);
}

hps_algo_main() currently has several candidate algorithms:

static int (*hps_func[]) (void) = {
/*hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas, hps_algo_up, hps_algo_down};*/
hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas};/* (1) 取perf规定的最小值 */
static int hps_algo_perf_indicator(void)
{unsigned int i;if (atomic_read(&hps_ctxt.is_ondemand) != 0) { /* for ondemand request */atomic_set(&hps_ctxt.is_ondemand, 0);mutex_lock(&hps_ctxt.para_lock);for (i = 0; i < hps_sys.cluster_num; i++)hps_sys.cluster_info[i].target_core_num =max(hps_sys.cluster_info[i].base_value, hps_sys.cluster_info[i].online_core_num);mutex_unlock(&hps_ctxt.para_lock);return 1;}return 0;
}/* (2) 根据当前load的值是否达到boost门限,来决定是否启动boost */
static int hps_algo_rush_boost(void)
{int val, base_val;unsigned int idx, total_rel_load;idx = total_rel_load = 0;for (idx = 0 ; idx < hps_sys.cluster_num ; idx++)total_rel_load += hps_sys.cluster_info[idx].rel_load;if (!hps_ctxt.rush_boost_enabled)return 0;base_val = cal_base_cores();if (total_rel_load > hps_ctxt.rush_boost_threshold * hps_sys.total_online_cores)++hps_ctxt.rush_count;elsehps_ctxt.rush_count = 0;if (hps_ctxt.rush_boost_times == 1)hps_ctxt.tlp_avg = hps_ctxt.cur_tlp;if ((hps_ctxt.rush_count >= hps_ctxt.rush_boost_times) &&(hps_sys.total_online_cores * 100 < hps_ctxt.tlp_avg)) {val = hps_ctxt.tlp_avg / 100 + (hps_ctxt.tlp_avg % 100 ? 1 : 0);WARN_ON(!(val > hps_sys.total_online_cores));if (val > num_possible_cpus())val = num_possible_cpus();if (val > base_val)val -= base_val;elseval = 0;hps_sys.tlp_avg = hps_ctxt.tlp_avg;hps_sys.rush_cnt = hps_ctxt.rush_count;hps_cal_core_num(&hps_sys, val, base_val);/* [MET] debug for geekbench */met_tag_oneshot(0, "sched_rush_boost", 1);return 1;} else {/* [MET] debug for geekbench */met_tag_oneshot(0, "sched_rush_boost", 0);return 0;}
}/* (3) 根据负载来计算需要的online cpu */
static int hps_algo_eas(void)
{int val, ret, i;ret = 0;for (i = 0 ; i < hps_sys.cluster_num ; i++) {hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num;/*if up_threshold > loading > down_threshold ==> No action*/if ((hps_sys.cluster_info[i].loading <(hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num)) &&(hps_sys.cluster_info[i].loading >(hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num)))continue;/*if loading > up_threshod ==> power on cores*/if ((hps_sys.cluster_info[i].loading >(hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num))) {val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].up_threshold;if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].up_threshold)val++;if (val <= hps_sys.cluster_info[i].limit_value)hps_sys.cluster_info[i].target_core_num = val;elsehps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].limit_value;ret = 1;} else if ((hps_sys.cluster_info[i].loading <(hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num))) {/*if loading < down_threshod ==> power off cores*/if (!hps_sys.cluster_info[i].loading) {hps_sys.cluster_info[i].target_core_num = 0;continue;}val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold;if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].down_threshold)val++;if (val >= hps_sys.cluster_info[i].base_value)hps_sys.cluster_info[i].target_core_num = val;elsehps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].base_value;ret = 1;}}#if 0/*Check with big task criteriai*/for (i = 1 ; i < hps_sys.cluster_num ; i++) {if ((!hps_sys.cluster_info[i].bigTsk_value) &&(!(hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold)))hps_sys.cluster_info[i].target_core_num = 0;}
#endifreturn ret;
}
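Condensed, the per-cluster rule in hps_algo_eas() above is: when loading exceeds up_threshold * online_cores the target core count becomes ceil(loading / up_threshold), capped at the limit value; when it falls below down_threshold * online_cores it becomes ceil(loading / down_threshold), floored at the base value (or 0 when the loading is 0); in between nothing changes. A standalone sketch with invented thresholds and loads:

#include <stdio.h>

/* Condensed version of the per-cluster decision in hps_algo_eas():
 * returns the new target core count for one cluster. */
static unsigned int eas_target_cores(unsigned int loading,
                                     unsigned int online,
                                     unsigned int up_th, unsigned int down_th,
                                     unsigned int base, unsigned int limit)
{
    unsigned int val;

    if (loading > up_th * online) {               /* power on cores  */
        val = (loading + up_th - 1) / up_th;      /* ceil()          */
        return val <= limit ? val : limit;
    }
    if (loading < down_th * online) {             /* power off cores */
        if (!loading)
            return 0;                             /* cluster is idle */
        val = (loading + down_th - 1) / down_th;
        return val >= base ? val : base;
    }
    return online;                                /* dead band: keep */
}

int main(void)
{
    /* Invented example: 80%/40% up/down thresholds per core, 2 cores
     * online, HICA allows 1..4 cores for this cluster. */
    unsigned int loads[] = { 0, 30, 90, 170, 250, 330 };

    for (unsigned int i = 0; i < sizeof(loads) / sizeof(loads[0]); i++)
        printf("loading=%3u -> target cores = %u\n", loads[i],
               eas_target_cores(loads[i], 2, 80, 40, 1, 4));
    return 0;
}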

4.5、NUMA load balancing

NUMA is not used on the ARM platforms discussed here, so it is not analyzed for now.
