cpuidle子系统之（二）：core层

关注公众号不迷路：DumpStack

扫码加关注

一、数据结构
二、系统中cpuidle_device的组织关系
三、系统中cpuidle_driver的组织关系
四、系统中cpuidle_governor的组织关系
五、挑选C state
六、进入C state
七、sysfs层接口
八、cpuidle_device的使能和禁用
- 8.1 cpuidle_enable_device - 使能一个cpu
- 8.2 cpuidle_disable_device - 禁用一个cpu
九、cpuidle子系统的使能和禁止
十、poll机制（待补充）
十一、coupled机制（待补充）
关注公众号不迷路：DumpStack

cpuidle子系统的core层，主要完成如下工作

为C state、device、driver、governor定义数据结构；
组织和管理device、driver、governor结构；
提供API接口

向上，为idle线程提供接口，这样idle线程就能通过调用这些API接口，方便地完成C state的选择和进入等流程
向下，为device、driver、governor提供注册接口
在sysfs中提供用户空间接口，用户空间可以通过这些节点完成相关信息查询和控制等功能

一、数据结构

1.1 cpuidle_state - 描述一个C state

前面提到过，cpuidle提出的主要背景是，很多复杂的CPU，有多种不同的idle等级（Linux中的术语为C state），这些不同级别的C state有不同的功耗和延迟，从而可以在不同的场景下使用。Linux使用cpuidle_state结构抽象一个C state

/* Describes one idle state (C state) that a cpuidle driver can enter. */
struct cpuidle_state {
	/* Name and description of this C state. */
	char name[CPUIDLE_NAME_LEN];
	char desc[CPUIDLE_DESC_LEN];

	/*
	 * exit_latency and target_residency expressed in nanoseconds.
	 * __cpuidle_driver_init() derives them from the microsecond fields
	 * below when those are non-zero (see commit c1d51f684c72b for why
	 * both units are kept).
	 */
	u64 exit_latency_ns; /* exit latency */
	u64 target_residency_ns; /* minimum residency */

	/*
	 * Flag bits, as listed by the article:
	 *   CPUIDLE_FLAG_TIME_VALID  - time spent in this state is measurable
	 *       (NOTE(review): may no longer exist in 5.10 - confirm)
	 *   CPUIDLE_FLAG_COUPLED     - state takes effect on several CPUs at
	 *                              once and needs special software handling
	 *   CPUIDLE_FLAG_POLLING     - not a real idle state, the CPU busy-waits
	 *   CPUIDLE_FLAG_TIMER_STOP  - the local timer is stopped in this state
	 *   CPUIDLE_FLAG_UNUSABLE    - the state is unavailable
	 *   CPUIDLE_FLAG_OFF         - the state is disabled by default
	 *   CPUIDLE_FLAG_TLB_FLUSHED - idle-state flushes TLBs
	 *   CPUIDLE_FLAG_RCU_IDLE    - idle-state takes care of RCU
	 */
	unsigned int flags;

	/*
	 * Time the CPU needs to get from this state back to running, in us.
	 * A large value makes idle<->run transitions slow and hurts
	 * responsiveness.
	 */
	unsigned int exit_latency; /* in US */

	/*
	 * Power drawn in this state, in mW. Per the article this is not used
	 * when selecting a state and is only exported through sysfs.
	 */
	int power_usage; /* in mW */

	/*
	 * Transition cost, called "minimum residency" in the article: entering
	 * and leaving an idle state costs energy, so the CPU must stay in the
	 * state at least this long (in us) for the entry to pay off. Governors
	 * use it, together with the current system situation, to pick a state.
	 */
	unsigned int target_residency; /* in US */

	/* Callback that enters this C state. */
	int (*enter) (struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index);

	/* Callback used when the CPU is taken offline (no work for a long time). */
	int (*enter_dead) (struct cpuidle_device *dev, int index);

	/*
	 * CPUs execute ->enter_s2idle with the local tick or entire timekeeping
	 * suspended, so it must not re-enable interrupts at any point (even
	 * temporarily) or attempt to change states of clock event devices.
	 *
	 * This callback may point to the same function as ->enter if all of
	 * the above requirements are met by it.
	 */
	int (*enter_s2idle)(struct cpuidle_device *dev,
			struct cpuidle_driver *drv,
			int index);
};

PS：关于cpuidle_state结构中已经有了exit_latency和target_residency，为什么还要再定义exit_latency_ns和target_residency_ns呢？这主要是因为core层内部的时间统计都是以ns为单位的，如果C state的参数以us为单位保存，每次比较都需要做单位换算（整数乘除法），效率低下，所以引入了以ns为单位的成员，该笔修改在c1d51f684c72b中提交，patch如下，可以研究一下

1.2 cpuidle_state_usage - 每个cpu的每个C state都对应一个，记录该cpu在该C state下的统计信息

/* Per-CPU, per-C-state statistics; one instance for every state of every CPU. */
struct cpuidle_state_usage {
	/*
	 * Non-zero when this state is disabled on this CPU. The bits tell who
	 * disabled it:
	 *   #define CPUIDLE_STATE_DISABLED_BY_USER   BIT(0)
	 *   #define CPUIDLE_STATE_DISABLED_BY_DRIVER BIT(1)
	 */
	unsigned long long disable;

	/* Number of times this CPU has entered this state. */
	unsigned long long usage;

	/* Total time this CPU has spent in this state, in ns. */
	u64 time_ns;

	/*
	 * above/below are both counted on the state that was actually entered
	 * (see cpuidle_enter_state()). Per the article they are only exported
	 * through sysfs and not consumed inside the kernel.
	 *
	 * above: the time actually spent in this state was shorter than its
	 * target residency while an enabled shallower state existed, i.e. this
	 * state turned out to be too deep.
	 */
	unsigned long long above; /* Number of times it's been too deep */

	/*
	 * below: after subtracting this state's exit latency, the time spent
	 * idle would have satisfied the target residency of the next enabled
	 * deeper state, i.e. this state turned out to be too shallow.
	 */
	unsigned long long below; /* Number of times it's been too shallow */

	/* Number of times a request to enter this state was rejected. */
	unsigned long long rejected; /* Number of times idle entry was rejected */

#ifdef CONFIG_SUSPEND
	/* Suspend-to-idle counterparts of usage/time. */
	unsigned long long s2idle_usage;
	unsigned long long s2idle_time; /* in US */
#endif
};

1.3 cpuidle_device - 描述一个cpu

每个cpu，都会对应一个cpuidle_device

/* Per-CPU cpuidle bookkeeping; every CPU has one cpuidle_device. */
struct cpuidle_device {
	/* Set once the device has been registered (see __cpuidle_register_device()). */
	unsigned int registered:1;

	/* Set while the device is enabled. */
	unsigned int enabled:1;

	/*
	 * Polling-related flag.
	 * NOTE(review): the article says "false means the CPU is currently
	 * polling"; upstream poll_idle() appears to clear it on entry and set
	 * it when the polling time limit is hit - confirm against poll_idle().
	 */
	unsigned int poll_time_limit:1;

	/* CPU this device belongs to. */
	unsigned int cpu;

	/*
	 * Written by cpuidle_enter() right before going idle with the value of
	 * tick_nohz_get_next_hrtimer() and reset to 0 after wake-up.
	 */
	ktime_t next_hrtimer;

	/* Index of the C state this CPU used last time (per the article). */
	int last_state_idx;

	/* Time spent in the last idle period; updated by cpuidle_enter_state(). */
	u64 last_residency_ns;

	/*
	 * Per the article: the smallest target residency among the states this
	 * CPU supports, normally that of the shallowest state since deeper
	 * states have larger residencies. TODO confirm against cpuidle_poll_time().
	 */
	u64 poll_limit_ns;

	u64 forced_idle_latency_limit_ns;

	/* Statistics for each C state of this CPU, up to CPUIDLE_STATE_MAX entries. */
	struct cpuidle_state_usage states_usage[CPUIDLE_STATE_MAX];

	/* sysfs objects. */
	struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
	struct cpuidle_driver_kobj *kobj_driver;
	struct cpuidle_device_kobj *kobj_dev;

	/* Links this device into the global cpuidle_detected_devices list. */
	struct list_head device_list;

	/* Coupled idle state support. */
#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
	cpumask_t coupled_cpus;
	struct cpuidle_coupled *coupled;
#endif
};

1.4 cpuidle_driver

/* A cpuidle driver: the table of C states and the CPUs it drives. */
struct cpuidle_driver {
	/* Driver name. */
	const char *name;

	/* Module that owns the driver. */
	struct module *owner;

	/* used by the cpuidle framework to setup the broadcast timer */
	/*
	 * Set by __cpuidle_driver_init() when any state carries
	 * CPUIDLE_FLAG_TIMER_STOP; __cpuidle_register_driver() then enables
	 * the broadcast timer on the driver's CPUs.
	 */
	unsigned int bctimer:1;

	/* states array must be ordered in decreasing power consumption */
	/*
	 * C states supported by this driver. The earlier a state is in the
	 * array, the shallower the sleep: lower transition cost (shorter
	 * minimum residency), shorter exit latency, higher power draw.
	 */
	struct cpuidle_state states[CPUIDLE_STATE_MAX];

	/* Number of valid entries in states[]. */
	int state_count;

	/* Related to coupled CPU idle (per the article; covered separately). */
	int safe_state_index;

	/* the driver handles the cpus in cpumask */
	/* Defaults to cpu_possible_mask when left NULL (see __cpuidle_driver_init()). */
	struct cpumask *cpumask;

	/* preferred governor to switch at register time */
	/* Looked up by name in cpuidle_register_driver(). */
	const char *governor;
};

1.5 cpuidle_governor

/* A cpuidle governor: the policy that decides which C state to enter. */
struct cpuidle_governor {
	/* Governor name. */
	char name[CPUIDLE_NAME_LEN];

	/* Links this governor into the global cpuidle_governors list. */
	struct list_head governor_list;

	/*
	 * Priority of the governor; larger means preferred. By default the
	 * registered governor with the highest rating becomes the current
	 * one, unless the user explicitly chooses another.
	 */
	unsigned int rating;

	/* Preparation before the governor is used on @dev, e.g. private data setup. */
	int (*enable) (struct cpuidle_driver *drv, struct cpuidle_device *dev);

	/* Teardown after the governor stops being used, e.g. freeing what enable set up. */
	void (*disable) (struct cpuidle_driver *drv, struct cpuidle_device *dev);

	/*
	 * Choose a C state based on the current system situation and the
	 * properties of each state. @stop_tick is an output: the caller uses
	 * it to decide whether the tick must be stopped before entering the
	 * returned state.
	 */
	int (*select) (struct cpuidle_driver *drv,
			struct cpuidle_device *dev,
			bool *stop_tick);

	/*
	 * Per the article: called after every return from a C state so the
	 * governor can evaluate the outcome and update its statistics for the
	 * next selection. The menu governor only sets a needs_update flag
	 * here and does the real update in the next select(), because reflect
	 * runs with interrupts disabled and should finish quickly.
	 */
	void (*reflect) (struct cpuidle_device *dev, int index);
};

二、系统中cpuidle_device的组织关系

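cpuidle_device描述一个受cpuidle控制的设备，本文特指cpu，系统中的所有已经注册的cpuidle_device都挂在cpuidle_detected_devices这个全局链表上。另外还有两个percpu变量：cpuidle_dev为每个cpu提供了一个cpuidle_device结构的存储空间（在cpuidle_register中使用）；cpuidle_devices则是一个percpu指针，device注册时（__cpuidle_register_device）被设置为指向该cpu对应的device，每个cpu可以通过它访问自己对应的device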

2.1 cpuidle_detected_devices - 系统中的所有cpu全部挂在这个链表上

__cpuidle_register_device会向该链表上添加device

/* Global list of registered cpuidle devices; __cpuidle_register_device() adds to it. */
LIST_HEAD(cpuidle_detected_devices);

2.2 cpuidle_dev - 每个cpu对应一个cpuidle_device结构

/* Per-CPU cpuidle_device storage; cpuidle_register() fills in and registers these. */
DEFINE_PER_CPU(struct cpuidle_device, cpuidle_dev);

2.3 cpuidle_register_device

/**

* cpuidle_register_device - registers a CPU's idle PM feature

* @dev: the cpu

int cpuidle_register_device(struct cpuidle_device *dev)

{

int ret = -EBUSY;

if (!dev)

return -EINVAL;

mutex_lock(&cpuidle_lock);

//1.如果这个device已经被注册过了，则退出

if (dev->registered)

goto out_unlock;

//2.初始化device中的cpuidle_state_usage成员

__cpuidle_device_init(dev);

//3.将cpuidle_device挂入全局链表

ret = __cpuidle_register_device(dev);

if (ret)

goto out_unlock;

//4.在sysfs文件系统中的节点

ret = cpuidle_add_sysfs(dev);

if (ret)

goto out_unregister;

//5.使能cpuidle_device

ret = cpuidle_enable_device(dev);

if (ret)

goto out_sysfs;

//6.所谓的idle handler，其实就是一个内部的全局变量initialized，

// 后面讲到cpuidle的核心功能时，会再说明

cpuidle_install_idle_handler();

out_unlock:

mutex_unlock(&cpuidle_lock);

return ret;

out_sysfs:

cpuidle_remove_sysfs(dev);

out_unregister:

__cpuidle_unregister_device(dev);

goto out_unlock;

}

2.3.1 __cpuidle_device_init - 初始化一个cpuidle_device

/*
 * Reset the run-time state of a cpuidle_device before it is registered:
 * clear every per-state statistics entry and zero the last residency and
 * the recorded next hrtimer.
 */
static void __cpuidle_device_init(struct cpuidle_device *dev)
{
	memset(dev->states_usage, 0, sizeof(dev->states_usage));
	dev->last_residency_ns = 0;
	dev->next_hrtimer = 0;
}

2.4 __cpuidle_register_device

/**

* __cpuidle_register_device - internal register function called before register

* and enable routines

* @dev: the cpu

* cpuidle_lock mutex must be held before this is called

static int __cpuidle_register_device(struct cpuidle_device *dev)

{

int ret;

//1.获取device对应的driver，device是每个cpu对应一个

struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);

if (!try_module_get(drv->owner))

return -EINVAL;

//2.将dev赋值给percpu变量，并将其挂在全局链表上

per_cpu(cpuidle_devices, dev->cpu) = dev;

list_add(&dev->device_list, &cpuidle_detected_devices);

//3.暂时不对coupled进行分析，函数始终返回0，接着设置registered标记

ret = cpuidle_coupled_register_device(dev);

if (ret)

__cpuidle_unregister_device(dev);

else

dev->registered = 1;

return ret;

}

三、系统中cpuidle_driver的组织关系

cpuidle_driver驱动的对象是cpuidle_device，即CPU，在SMP系统中有多个CPU，也就有多个cpuidle_device，如果这些device的idle功能相同（最关键的是C state的个数、参数相同），那么一个cpuidle_driver就可以驱动这些device，否则，则需要多个driver才能驱动

基于上面的事实，cpuidle子系统提供一个名称为CONFIG_CPU_IDLE_MULTIPLE_DRIVERS的配置项，用于设置是否需要多个cpuidle_driver，如果没有使能这个配置项，说明所有CPU的idle功能相同，一个cpuidle_driver即可，此时cpuidle_driver的注册，就是将driver保存在一个名称为cpuidle_curr_driver的全局指针中；相反，如果使能这个配置项，说明不同CPU的idle功能不同，此时每个CPU都要有一个driver，详细代码实现如下：

/*
 * Where the registered driver is remembered: one pointer per CPU when
 * several drivers may coexist, otherwise a single global pointer.
 */
#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
static DEFINE_PER_CPU(struct cpuidle_driver *, cpuidle_drivers);
#else
static struct cpuidle_driver *cpuidle_curr_driver;
#endif

3.1 cpuidle_get_driver - 获取当前cpu正在使用的driver

/**
 * cpuidle_get_driver - return the driver tied to the current CPU.
 *
 * Returns a struct cpuidle_driver pointer, or NULL if no driver is registered.
 */
struct cpuidle_driver *cpuidle_get_driver(void)
{
	struct cpuidle_driver *drv;
	int cpu;

	/* 1. Get the current CPU id (get_cpu() disables preemption). */
	cpu = get_cpu();

	/* 2. Fetch the driver registered for that CPU. */
	drv = __cpuidle_get_cpu_driver(cpu);

	/* Re-enable preemption. */
	put_cpu();

	return drv;
}

其中get_cpu实现如下。之所以要关抢占，是因为如果在读取当前cpu编号之后、使用该编号之前，线程被抢占并迁移到了其他cpu上，那么拿到的编号就不再是"当前cpu"了；关闭抢占可以保证在get_cpu和put_cpu之间，线程始终运行在同一个cpu上

/* Disable preemption and evaluate to the current CPU id; pair with put_cpu(). */
#define get_cpu() ({ preempt_disable(); __smp_processor_id(); })
/* Re-enable preemption after get_cpu(). */
#define put_cpu() preempt_enable()

3.2 cpuidle_get_cpu_driver - 获得指定cpu使用的cpuidle_driver

从传入的参数dev中可以获得这个device所属的cpu，该函数获得cpu所使用的cpuidle_driver

/**

* cpuidle_get_cpu_driver - return the driver registered for a CPU.

* @dev: a valid pointer to a struct cpuidle_device

* Returns a struct cpuidle_driver pointer, or NULL if no driver is registered

* for the CPU associated with @dev.

struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev)

{

if (!dev)

return NULL;

return __cpuidle_get_cpu_driver(dev->cpu);

}

3.3 __cpuidle_get_cpu_driver - 获得指定cpu使用的driver结构

Linux中关于cpuidle_driver有两种形式，每个cpu可以有自己的driver，也可以所有cpu共用一个driver，通过CONFIG_CPU_IDLE_MULTIPLE_DRIVERS宏可控制

#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS

/**

* __cpuidle_get_cpu_driver - return the cpuidle driver tied to a CPU.

* @cpu: the CPU handled by the driver

* Returns a pointer to struct cpuidle_driver or NULL if no driver has been

* registered for @cpu.

static struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)

{

return per_cpu(cpuidle_drivers, cpu);

}

#else

/**

* __cpuidle_get_cpu_driver - return the global cpuidle driver pointer.

* @cpu: ignored without the multiple driver support

* Return a pointer to a struct cpuidle_driver object or NULL if no driver was

* previously registered.

static inline struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)

{

return cpuidle_curr_driver;

}

#endif

3.4 __cpuidle_set_driver - 启用指定的driver

所谓启用，实际就是将这个driver赋值给percpu变量cpuidle_drivers或者全局变量cpuidle_curr_driver

#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS

/**

* __cpuidle_set_driver - set per CPU driver variables for the given driver.

* @drv: a valid pointer to a struct cpuidle_driver

* For each CPU in the driver's cpumask, unset the registered driver per CPU

* to @drv.

* Returns 0 on success, -EBUSY if the CPUs have driver(s) already.

static inline int __cpuidle_set_driver(struct cpuidle_driver *drv)

{

int cpu;

//1.遍历这个driver控制的所有cpu

for_each_cpu(cpu, drv->cpumask) {

//2.满足下面条件，表示这个cpu已经有一个可用的driver了，

// 这时候设置就失败了，只要有一个设置失败了，就会回滚

if (__cpuidle_get_cpu_driver(cpu)) {

__cpuidle_unset_driver(drv);

return -EBUSY;

}

//3.设置新的driver

per_cpu(cpuidle_drivers, cpu) = drv;

}

return 0;

}

#else

/**

* __cpuidle_set_driver - assign the global cpuidle driver variable.

* @drv: pointer to a struct cpuidle_driver object

* Returns 0 on success, -EBUSY if the driver is already registered.

static inline int __cpuidle_set_driver(struct cpuidle_driver *drv)

{

if (cpuidle_curr_driver)

return -EBUSY;

cpuidle_curr_driver = drv;

return 0;

}

#endif

3.5 __cpuidle_unset_driver - 禁用指定的driver

#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS

/**

* __cpuidle_unset_driver - unset per CPU driver variables.

* @drv: a valid pointer to a struct cpuidle_driver

* For each CPU in the driver's CPU mask, unset the registered driver per CPU

* variable. If @drv is different from the registered driver, the corresponding

* variable is not cleared.

static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv)

{

int cpu;

//1.遍历所有的cpu

for_each_cpu(cpu, drv->cpumask) {

//2.找到要卸载的driver

if (drv != __cpuidle_get_cpu_driver(cpu))

continue;

//3.设为NULL

per_cpu(cpuidle_drivers, cpu) = NULL;

}

#else

/**

* __cpuidle_unset_driver - unset the global cpuidle driver variable.

* @drv: a pointer to a struct cpuidle_driver

* Reset the global cpuidle variable to NULL. If @drv does not match the

* registered driver, do nothing.

static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv)

{

if (drv == cpuidle_curr_driver)

cpuidle_curr_driver = NULL;

}

#endif

3.6 __cpuidle_register_driver - 注册并启用这个driver

/**

* __cpuidle_register_driver: register the driver

* @drv: a valid pointer to a struct cpuidle_driver

* Do some sanity checks, initialize the driver, assign the driver to the

* global cpuidle driver variable(s) and set up the broadcast timer if the

* cpuidle driver has some states that shut down the local timer.

* Returns 0 on success, a negative error code otherwise:

* * -EINVAL if the driver pointer is NULL or no idle states are available

* * -ENODEV if the cpuidle framework is disabled

* * -EBUSY if the driver is already assigned to the global variable(s)

static int __cpuidle_register_driver(struct cpuidle_driver *drv)

{

int ret;

//1.要注册的driver里面要有可用的C state

if (!drv || !drv->state_count)

return -EINVAL;

//2.coupled校验driver参数的有效性

ret = cpuidle_coupled_state_verify(drv);

if (ret)

return ret;

//3.判断cpuidle子系统是否已经被disable掉

if (cpuidle_disabled())

return -ENODEV;

//4.完成数据结构中的一些初始化

__cpuidle_driver_init(drv);

//5.启用这个driver，所谓启用，实际就是将这个driver赋值给

// percpu变量cpuidle_drivers或者全局变量cpuidle_curr_driver

ret = __cpuidle_set_driver(drv);

if (ret)

return ret;

//6.broadcast_timer相关

// cpuidle state中flasg被打上CPUIDLE_FLAG_TIMER_STOP标签时，说明对应

// 的CPU在进入idle state时，会停掉该CPU的local timer，此时kernel的时

// 间子系统便不能再依赖本CPU的local timer，针对这种情况，设计者会提供一个

// broadcast timer，该timer独立于所有CPU运行，并可以把tick广播到每个CPU

// 上，因而不受idle state的影响。因此，如果cpuidle state具有STOP TIMER

// 的特性的话，需要在driver注册时，提供下面回调函数接口，告知clock events

// 模块，打开broadcast timer

// drv->bctimer在__cpuidle_driver_ini中设置

// 根据drv->bctimer的状态，调用cpuidle_setup_broadcast_timer接口，打开

// 具体CPU上的broadcat timer

// 其中on_each_cpu_mask可以在指定的CPU上运行函数

if (drv->bctimer)

on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,

(void *)1, 1);

return 0;

}

3.6.1 __cpuidle_driver_init - 对cpuidle_driver结构完成初始化

/**

* __cpuidle_driver_init - initialize the driver's internal data

* @drv: a valid pointer to a struct cpuidle_driver

static void __cpuidle_driver_init(struct cpuidle_driver *drv)

{

int i;

* Use all possible CPUs as the default, because if the kernel boots

* with some CPUs offline and then we online one of them, the CPU

* notifier has to know which driver to assign.

//1.如果这个driver没有指定cpu的话，则默认指定系统中所有可能的cpu

if (!drv->cpumask)

drv->cpumask = (struct cpumask *)cpu_possible_mask;

//2.遍历每一个C state，完成相应的初始化

for (i = 0; i < drv->state_count; i++) {

struct cpuidle_state *s = &drv->states[i];

* Look for the timer stop flag in the different states and if

* it is found, indicate that the broadcast timer has to be set

* up.

//2.1 如果设置了该标记位，则表示在进入该C state之前，

// 必须开启broadcast timer，则置位bctimer，后面会用

// 注意：只要这个driver中有一个state被打上该标记，都会设置bctimer

if (s->flags & CPUIDLE_FLAG_TIMER_STOP)

drv->bctimer = 1;

* The core will use the target residency and exit latency

* values in nanoseconds, but allow drivers to provide them in

* microseconds too.

//2.2 "最小滞留时间"单位转化成ns

if (s->target_residency > 0)

s->target_residency_ns = s->target_residency * NSEC_PER_USEC;

//2.2 退出延时的单位转化成ns

if (s->exit_latency > 0)

s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;

}

3.6.2 cpuidle_setup_broadcast_timer

/**
 * cpuidle_setup_broadcast_timer - enable/disable the broadcast timer on a cpu
 * @arg: a void pointer used to match the SMP cross call API
 *
 * If @arg is NULL broadcast is disabled otherwise enabled
 *
 * This function is executed per CPU by an SMP cross call. It's not
 * supposed to be called directly.
 */
static void cpuidle_setup_broadcast_timer(void *arg)
{
	if (arg)
		tick_broadcast_enable();
	else
		tick_broadcast_disable();
}

3.6.3 on_each_cpu_mask - 在指定的cpu上运行函数

U:\linux-5.10.61\kernel\smp.c

/**

* on_each_cpu_mask(): Run a function on processors specified by

* cpumask, which may include the local processor.

* @mask: The set of cpus to run on (only runs on online subset).

* @func: The function to run. This must be fast and non-blocking.

* @info: An arbitrary pointer to pass to the function.

* @wait: If true, wait (atomically) until function has completed

* on other CPUs.

* If @wait is true, then returns once @func has returned.

* You must not call this function with disabled interrupts or from a

* hardware interrupt handler or from a bottom half handler. The

* exception is that it may be used during early boot while

* early_boot_irqs_disabled is set.

void on_each_cpu_mask(

const struct cpumask *mask, //要在哪些cpu上运行

smp_call_func_t func, //要运行的函数

void *info, //上面函数传入的参数

bool wait) //是否需要等待所有cpu执行完毕

{

int cpu = get_cpu();

//1.在指定的cpu调用函数

smp_call_function_many(mask, func, info, wait);

//2.在local cpu上调用

if (cpumask_test_cpu(cpu, mask)) {

unsigned long flags;

local_irq_save(flags);

func(info);

local_irq_restore(flags);

}

put_cpu();

}

U:\linux-5.10.61\kernel\up.c

* Note we still need to test the mask even for UP

* because we actually can get an empty mask from

* code that on SMP might call us without the local

* CPU in the mask.

void on_each_cpu_mask(

const struct cpumask *mask, //要在哪些cpu上执行

smp_call_func_t func, //要执行的函数

void *info, //上面函数中传入的参数

bool wait) //是否需要等待上面函数在所有cpu上执行完毕

{

unsigned long flags;

if (cpumask_test_cpu(0, mask)) {

local_irq_save(flags);

func(info);

local_irq_restore(flags);

}

3.7 cpuidle_register_driver - 注册driver，并启动这个driver和对应的governor

/**

* cpuidle_register_driver - registers a driver

* @drv: a pointer to a valid struct cpuidle_driver

* Register the driver under a lock to prevent concurrent attempts to

* [un]register the driver from occuring at the same time.

* Returns 0 on success, a negative error code (returned by

* __cpuidle_register_driver()) otherwise.

int cpuidle_register_driver(struct cpuidle_driver *drv)

{

struct cpuidle_governor *gov;

int ret;

spin_lock(&cpuidle_driver_lock);

//1.注册驱动，并启用这个driver

ret = __cpuidle_register_driver(drv);

spin_unlock(&cpuidle_driver_lock);

//2.如果上面在注册这个driver的时候，也成功的启用了这个driver，

// 则需要将governor也切换到这个driver所使用的governor

// 目前还没有发现内核中有对param_governor设置的地方，暂且认为他是全空的值

if (!ret && !strlen(param_governor) && drv->governor &&

(cpuidle_get_driver() == drv)) {

mutex_lock(&cpuidle_lock);

//2.1 根据governor的名称找到对应的governor

// 并切换到这个新的governoe

gov = cpuidle_find_governor(drv->governor);

if (gov) {

cpuidle_prev_governor = cpuidle_curr_governor;

if (cpuidle_switch_governor(gov) < 0)

cpuidle_prev_governor = NULL;

}

mutex_unlock(&cpuidle_lock);

}

return ret;

}

3.8 cpuidle_register - 注册driver+device

/**

* cpuidle_register: registers the driver and the cpu devices with the

* coupled_cpus passed as parameter. This function is used for all common

* initialization pattern there are in the arch specific drivers. The

* devices is globally defined in this file.

* @drv : a valid pointer to a struct cpuidle_driver

* @coupled_cpus: a cpumask for the coupled states

* Returns 0 on success, < 0 otherwise

int cpuidle_register(struct cpuidle_driver *drv,

const struct cpumask *const coupled_cpus)

{

int ret, cpu;

struct cpuidle_device *device;

//1.注册driver，并启动这个driver和对应的governor

ret = cpuidle_register_driver(drv);

if (ret) {

pr_err("failed to register cpuidle driver\n");

return ret;

}

//2.遍历这个driver管理的cpu，初始他们的cpuidle_device结构

// 并注册这些device

for_each_cpu(cpu, drv->cpumask) {

device = &per_cpu(cpuidle_dev, cpu);

device->cpu = cpu;

#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED

* On multiplatform for ARM, the coupled idle states could be

* enabled in the kernel even if the cpuidle driver does not

* use it. Note, coupled_cpus is a struct copy.

if (coupled_cpus)

device->coupled_cpus = *coupled_cpus;

#endif

//2.1 注册device

ret = cpuidle_register_device(device);

if (!ret)

continue;

pr_err("Failed to register cpuidle device for cpu%d\n", cpu);

cpuidle_unregister(drv);

break;

}

return ret;

}

四、系统中cpuidle_governor的组织关系

系统中所有governor挂在该链表上，可通过cpuidle_curr_governor找到系统当前正在使用的governor

4.1 cpuidle_governors - 系统中所有governor挂在该链表上

/* Global list of registered governors; cpuidle_register_governor() appends to it. */
LIST_HEAD(cpuidle_governors);

4.2 cpuidle_curr_governor - 系统当前正在使用的governor

/* Governor currently in use; updated by cpuidle_switch_governor(). */
struct cpuidle_governor *cpuidle_curr_governor;

4.3 cpuidle_prev_governor - 系统上一次正在使用的governor

/*
 * Governor that was current before cpuidle_register_driver() switched to the
 * driver's preferred one; reset to NULL there if that switch fails.
 */
struct cpuidle_governor *cpuidle_prev_governor;

4.4 cpuidle_register_governor - 注册一个governor

/**

* cpuidle_register_governor - registers a governor

* @gov: the governor

int cpuidle_register_governor(struct cpuidle_governor *gov)

{

int ret = -EEXIST;

if (!gov || !gov->select)

return -EINVAL;

if (cpuidle_disabled())

return -ENODEV;

mutex_lock(&cpuidle_lock);

//1.如果这个governor确实没有注册，则执行下面的注册动作

if (cpuidle_find_governor(gov->name) == NULL) {

ret = 0;

//2.将这个governor添加进系统的链表中

list_add_tail(&gov->governor_list, &cpuidle_governors);

//3.如果系统中当前没有governor可用，或者新注册进来的governor的rating更高

// 则切换到这个新注册的governor使用，（governor中的rating表示改governor

// 的级别，内核默认会选择系统中rating值最大的governor作为当前governor）

// strncasecmp表示不区分大小写比较两个字符串是否相等

// 目前还没有发现内核中有对param_governor设置的地方，暂且认为他是全空的值

if (!cpuidle_curr_governor ||

!strncasecmp(param_governor, gov->name, CPUIDLE_NAME_LEN) ||

(cpuidle_curr_governor->rating < gov->rating &&

strncasecmp(param_governor, cpuidle_curr_governor->name, CPUIDLE_NAME_LEN)))

cpuidle_switch_governor(gov);

}

mutex_unlock(&cpuidle_lock);

return ret;

}

关于param_governor全局变量，可以参见下面的patch，它允许通过命令行参数指定一个governor，通过这种方法可以替换默认的governor；如果没有指定，param_governor就是一个空字符串（所以代码中用strlen判断它是否为空），一般我们手机中都没有指定

4.5 cpuidle_find_governor - 由name找到对应的governor

系统中的所有governor挂在一个全局链表cpuidle_governors上，通过name匹配查找

/**

* cpuidle_find_governor - finds a governor of the specified name

* @str: the name

* Must be called with cpuidle_lock acquired.

struct cpuidle_governor *cpuidle_find_governor(const char *str)

{

struct cpuidle_governor *gov;

//遍历系统中所有governor，找到name一致的governor

list_for_each_entry(gov, &cpuidle_governors, governor_list)

if (!strncasecmp(str, gov->name, CPUIDLE_NAME_LEN))

return gov;

return NULL;

}

4.6 cpuidle_switch_governor - 切换到指定的governor

/**

* cpuidle_switch_governor - changes the governor

* @gov: the new target governor

* Must be called with cpuidle_lock acquired.

int cpuidle_switch_governor(struct cpuidle_governor *gov)

{

struct cpuidle_device *dev;

if (!gov)

return -EINVAL;

//1.如果这个gov就是当前正在使用的governor，则退出

if (gov == cpuidle_curr_governor)

return 0;

//2.在切换governor之前需要唤醒所有已经idle的cpu

// 为啥捏，因为这些cpu是用old governor的判断标准决定是否需要进入idle，

// 以及进入哪一个C state的，现在切换到new governor，游戏规则变了，这个

// cpu是否需要进入idle，以及进入到哪一个C state都不确定，所以先唤醒，后

// 面cpu根据自己的需要，重新走进入idle的流程

cpuidle_uninstall_idle_handler();

if (cpuidle_curr_governor) {

//3.cpuidle_detected_devices全局链表上挂着系统中所有的device

// 先将这些device全部disable掉

list_for_each_entry(dev, &cpuidle_detected_devices, device_list)

cpuidle_disable_device(dev);

}

//4.指定当前正在使用的governor

cpuidle_curr_governor = gov;

if (gov) {

//5.重新使能系统中的所有device

list_for_each_entry(dev, &cpuidle_detected_devices, device_list)

cpuidle_enable_device(dev);

//6.这里实际只是设置全局变量initialized

// 之后上面已经被唤醒的cpu再根据自己的需要重新判断是否需要进入idle

cpuidle_install_idle_handler();

printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);

}

return 0;

}

五、挑选C state

挑选C state本来应该是governor要完成的工作，governor根据当前系统的运行情况，结合自己的游戏规则，挑选出一个合适的C state；但是也有一种场景，cpuidle子系统需要直接进入睡眠最深的idle状态，而不管governor的游戏规则。

所以cpuidle在core层提供了两类接口用于挑选C state，下面分别介绍

5.1 cpuidle_find_deepest_state - 挑选出睡眠最深的C state

从driver支持的所有C state中，挑选出睡眠最深的C state

/**

* cpuidle_find_deepest_state - Find the deepest available idle state.

* @drv: cpuidle driver for the given CPU.

* @dev: cpuidle device for the given CPU.

* @latency_limit_ns: Idle state exit latency limit

* Return: the index of the deepest available idle state.

int cpuidle_find_deepest_state(

struct cpuidle_driver *drv,

struct cpuidle_device *dev,

u64 latency_limit_ns) //找出的C state退出延迟不能超过这个值

{

return find_deepest_state(drv, dev, latency_limit_ns, 0, false);

}

5.2 find_deepest_state - 找出睡眠最深的C state

/*
 * Return the index of the deepest usable C state of @drv for @dev.
 * "Deepest" is judged by exit latency: a state only replaces the current
 * candidate if its exit latency is strictly larger. Index 0 is returned
 * when no state from index 1 upwards qualifies.
 */
static int find_deepest_state(
	struct cpuidle_driver *drv,
	struct cpuidle_device *dev,
	u64 max_latency_ns,		/* upper bound for the exit latency of the result */
	unsigned int forbidden_flags,	/* the result must carry none of these flags */
	bool s2idle)			/* if true, the result must have an enter_s2idle callback */
{
	u64 latency_req = 0;
	int i, ret = 0;

	/*
	 * 1. Walk the states from shallow to deep. The loop starts at 1:
	 * state 0 is never examined and serves as the default result.
	 */
	for (i = 1; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];

		/*
		 * 2. Skip states that are disabled on this CPU, are not deeper
		 * (by exit latency) than the best one found so far, exceed the
		 * latency limit, carry a forbidden flag, or lack enter_s2idle
		 * when it is required.
		 */
		if (dev->states_usage[i].disable ||
		    s->exit_latency_ns <= latency_req ||
		    s->exit_latency_ns > max_latency_ns ||
		    (s->flags & forbidden_flags) ||
		    (s2idle && !s->enter_s2idle))
			continue;

		/* 3. New best candidate: remember its exit latency and index. */
		latency_req = s->exit_latency_ns;
		ret = i;
	}

	/* 4. Index of the selected state. */
	return ret;
}

5.3 cpuidle_select - 根据系统运行状况，选择一个合适的C state

具体依据哪些因素去挑选，由governor决定

/**

* cpuidle_select - ask the cpuidle framework to choose an idle state

* @drv: the cpuidle driver

* @dev: the cpuidle device

* @stop_tick: indication on whether or not to stop the tick

* Returns the index of the idle state. The return value must not be negative.

* The memory location pointed to by @stop_tick is expected to be written the

* 'false' boolean value if the scheduler tick should not be stopped before

* entering the returned state.

int cpuidle_select(

struct cpuidle_driver *drv,

struct cpuidle_device *dev,

bool *stop_tick) //是否需要停止tick

{

//依据什么标准去选择C state，由governor自己决定

return cpuidle_curr_governor->select(drv, dev, stop_tick);

}

六、进入C state

cpuidle子系统在core层也提供了一些接口，idle线程通过调用下面的接口，就能进入到指定的C state中睡眠

6.1 call_cpuidle - 进入指定级别的C state

static int call_cpuidle(

struct cpuidle_driver *drv,

struct cpuidle_device *dev,

int next_state) //要进入的C state

{

* The idle task must be scheduled, it is pointless to go to idle, just

* update no idle residency and return.

//1.判断是否有其他线程需要调度

// 如果系统中还有其他线程等待调度，说明系统现在处于忙状态，不能进入idle

// 注意：由上一章对idle线程的分析可知，只有在idle中才会调用这个函数的，

// 所以此时的current是idle线程

if (current_clr_polling_and_test()) {

dev->last_residency_ns = 0;

local_irq_enable();

return -EBUSY;

}

* Enter the idle state previously returned by the governor decision.

* This function will block until an interrupt occurs and will take

* care of re-enabling the local interrupts

//2.进入指定级别的C state

return cpuidle_enter(drv, dev, next_state);

}

6.2 cpuidle_enter - 进入指定级别的C state

/**

* cpuidle_enter - enter into the specified idle state

* @drv: the cpuidle driver tied with the cpu

* @dev: the cpuidle device

* @index: the index in the idle state table

* Returns the index in the idle state, < 0 in case of error.

* The error code depends on the backend driver

int cpuidle_enter(

struct cpuidle_driver *drv,

struct cpuidle_device *dev,

int index) //要进入的C state

{

int ret = 0;

* Store the next hrtimer, which becomes either next tick or the next

* timer event, whatever expires first. Additionally, to make this data

* useful for consumers outside cpuidle, we rely on that the governor's

* ->select() callback have decided, whether to stop the tick or not.

//1.睡眠前，记录下一个即将触发的高精度定时器

WRITE_ONCE(dev->next_hrtimer, tick_nohz_get_next_hrtimer());

//2.进入指定级别的idle

// coupled为"耦合状态"相关，这里我们先不分析，直接进入cpuidle_enter_state

if (cpuidle_state_is_coupled(drv, index))

ret = cpuidle_enter_state_coupled(dev, drv, index);

else

ret = cpuidle_enter_state(dev, drv, index);

//3.结束睡眠后，将该值清零

WRITE_ONCE(dev->next_hrtimer, 0);

return ret;

}

6.3 cpuidle_enter_state - 进入指定级别的C state

/**

* cpuidle_enter_state - enter the state and update stats

* @dev: cpuidle device for this cpu

* @drv: cpuidle driver for this cpu

* @index: index into the states table in @drv of the state to enter

int cpuidle_enter_state(

struct cpuidle_device *dev,

struct cpuidle_driver *drv,

int index) //要进入的C state

{

int entered_state;

//1.根据索引找到对于的idle state

struct cpuidle_state *target_state = &drv->states[index];

//2.在该C state下，local timer会不会被停掉，

// 如果被停掉的话则需要使能broascast机制

bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP);

ktime_t time_start, time_end;

* Tell the time framework to switch to a broadcast timer because our

* local timer will be shut down. If a local timer is used from another

* CPU as a broadcast timer, this call may fail if it is not available.

//3.考虑到broadcast，重新选择C state

// 由上面的注释的含义，这里是告诉时间子系统：因为local cpu的timer即将关闭，

// 下面要进入broadcast模式了，如果这个cpu的定时器需要作为其他cpu的broadcast

// 的timer时候，下面的调用将会失败

if (broadcast && tick_broadcast_enter()) {

//3.1 下面重新找出一个C state，同时满足下面条件、睡眠最深的C state

// a) 退出延迟小于target_state->exit_latency_ns

// b) 不能带有CPUIDLE_FLAG_TIMER_STOP标记

index = find_deepest_state(drv, dev, target_state->exit_latency_ns,

CPUIDLE_FLAG_TIMER_STOP, false);

if (index < 0) {

//3.2 没有找到合适的C state，调用default_idle_call进入idle

// arm64实际就是执行wfi指令进入睡眠，trace标记的idx为1

default_idle_call();

return -EBUSY;

}

target_state = &drv->states[index];

broadcast = false;

}

//4.leave_mm只有在x86的一些平台中使能，表示在进idle之前刷TLB

if (target_state->flags & CPUIDLE_FLAG_TLB_FLUSHED)

leave_mm(dev->cpu);

/* Take note of the planned idle state. */

//5.在rq结构中记录当前cpu所处的idle_state等级

sched_idle_set_state(target_state);

//6.打印trace相关

trace_cpu_idle(index, dev->cpu);

//7.记录进入idle的时间戳，用于后面统计在idle中呆了多长时间

time_start = ns_to_ktime(local_clock());

//8.通知rcu子系统，后面分析

stop_critical_timings();

if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))

rcu_idle_enter();

//8.调用下面函数进入idle状态，直到从idle状态中退出时该函数才会返回

// 返回值表示刚刚是从那个状态中退出的

entered_state = target_state->enter(dev, drv, index);

//9.代码走到这里表示已经从idle中退出了

if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE))

rcu_idle_exit();

start_critical_timings();

sched_clock_idle_wakeup_event();

//10.记录退出idle的时间戳，用于统计在这个C state中呆了多长时间

time_end = ns_to_ktime(local_clock());

trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);

/* The cpu is no longer idle or about to enter idle. */

//11.在rq结构中记录当前cpu所处的idle_state等级，NULL表示已经退出idle等级了

sched_idle_set_state(NULL);

//12.下面这些广播相关的目前看不懂，以后再分析吧

if (broadcast) {

if (WARN_ON_ONCE(!irqs_disabled()))

local_irq_disable();

tick_broadcast_exit();

}

if (!cpuidle_state_is_coupled(drv, index))

local_irq_enable();

//10.上面返回的entered_state表示刚刚是从哪一个C state中退出的，下面需要做一些统计的工作

if (entered_state >= 0) {

//10.1 从该状态退出所需的延时

s64 diff, delay = drv->states[entered_state].exit_latency_ns;

int i;

* Update cpuidle counters

* This can be moved to within driver enter routine,

* but that results in multiple copies of same code.

//10.2 这个时间差表示在这个idle state中一共呆了多长时间

diff = ktime_sub(time_end, time_start);

//10.3 统计信息

dev->last_residency_ns = diff;

dev->states_usage[entered_state].time_ns += diff;

dev->states_usage[entered_state].usage++;

//10.3 target_residency_ns表示该C state的最小滞留时间，而diff表示实际滞留时间

if (diff < drv->states[entered_state].target_residency_ns) {

//10.4 满足该条件表示在该C state下的实际滞留时间，小于该C state的最小滞留时间

// 满足该条件是由于C state挑选不合理造成的，也就是选择了深睡眠，但是实际睡眠不够

// 也就是说：站在"最小滞留时间"的角度，idx比entered_state小的所有C state

// 都能满足要求，（idx小的还能获取到最佳的性能，因为退出延时短啊，这里忽视"功

// 耗"，仅从"性能"角度考虑）

// 需要注意的是，索引值越小，睡眠越浅，功耗越高，

// 相应的唤醒延迟就越小，也就是说各C state按功耗降序排列

for (i = entered_state - 1; i >= 0; i--) {

if (dev->states_usage[i].disable)

continue;

/* Shallower states are enabled, so update. */

//10.5 above记录系统从启动到现在，选中了当前C state、但实际滞留时间不足（本应选择更浅的C state）的总次数

dev->states_usage[entered_state].above++;

break;

}

} else if (diff > delay) {

//10.5 满足该条件表示在该C state下的实际滞留时间diff，大于该C state的退出延迟

// 满足该条件是正常条件

for (i = entered_state + 1; i < drv->state_count; i++) {

if (dev->states_usage[i].disable)

continue;

* Update if a deeper state would have been a

* better match for the observed idle duration.

//10.6 注意，上面在记录diff的时候，是包含在idle中的退出延迟的

// 两者的差表示在这个C state中"实际睡眠时间"，

// 如果这个"实际睡眠时间"大于"最小滞留时间"（因为是向上遍历的），

// 则说明有更深的C state符合当前系统的需求（站在更加节能的角度），

// 但是却偏偏选择了比自己睡眠浅的C state的次数

if (diff - delay >= drv->states[i].target_residency_ns)

dev->states_usage[entered_state].below++;

break;

}

} else {

//11.代码走到这里，说明上次在请求进入index这个C state失败了，

// last_residency_ns表示上一次在睡眠状态睡了多长时间

// rejected表示：请求进入该idx对应的idle等级被拒绝的次数

dev->last_residency_ns = 0;

dev->states_usage[index].rejected++;

}

//12.返回进入的idle等级

return entered_state;

}

6.3.1 sched_idle_set_state - 在rq中记录当前所处于的C state级别

/**
 * sched_idle_set_state - Record idle state for the current CPU.
 * @idle_state: State to record.
 *
 * Forwards @idle_state to idle_set_state() for the runqueue returned by
 * this_rq().
 */
void sched_idle_set_state(struct cpuidle_state *idle_state)
{
	idle_set_state(this_rq(), idle_state);
}

6.3.2 idle_set_state - 在rq中记录当前所处于的C state级别

/* Record @idle_state in @rq->idle_state; no other field of @rq is touched. */
static inline void idle_set_state(struct rq *rq,
				  struct cpuidle_state *idle_state)
{
	rq->idle_state = idle_state;
}

6.4 cpuidle_play_dead - 进入允许的最深的睡眠状态

/**

* cpuidle_play_dead - cpu off-lining

* Returns in case of an error or no driver

int cpuidle_play_dead(void)

{

struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);

struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);

int i;

if (!drv)

return -ENODEV;

/* Find lowest-power state that supports long-term idle */

//1.注意：这里是从idle等级最深的哪一级开始向前遍历，

// 也就是优先选择功耗最小的idle等级

for (i = drv->state_count - 1; i >= 0; i--)

if (drv->states[i].enter_dead)

return drv->states[i].enter_dead(dev, i);

return -ENODEV;

}

七、sysfs层接口

7.1 /sys/devices/system/cpu/cpuidle下的节点

文件位置：W:\opensource\linux-5.10.61\drivers\cpuidle\sysfs.c

/* available_governors: read-only (0444), served by show_available_governors. */
static DEVICE_ATTR(available_governors, 0444, show_available_governors, NULL);
/* current_driver: read-only (0444), served by show_current_driver. */
static DEVICE_ATTR(current_driver, 0444, show_current_driver, NULL);
/* current_governor: read-write (0644); writes go to store_current_governor. */
static DEVICE_ATTR(current_governor, 0644, show_current_governor, store_current_governor);
/* current_governor_ro: read-only (0444) view that reuses show_current_governor. */
static DEVICE_ATTR(current_governor_ro, 0444, show_current_governor, NULL);

7.1.1 available_governors - 查看系统中所有可用的governor

show_available_governors实现如下，遍历系统中的所有governor，打印出name

/*
 * sysfs show handler for "available_governors": writes the name of every
 * governor on the cpuidle_governors list into @buf, each followed by a
 * space, then a final newline. Returns the number of bytes written.
 */
static ssize_t show_available_governors(
					struct device *dev,
					struct device_attribute *attr,
					char *buf)
{
	ssize_t i = 0;
	struct cpuidle_governor *tmp;

	mutex_lock(&cpuidle_lock);
	/* 1. Every governor is linked on the global cpuidle_governors list. */
	list_for_each_entry(tmp, &cpuidle_governors, governor_list) {
		/*
		 * 2. Stop once fewer than CPUIDLE_NAME_LEN + 2 bytes remain
		 *    below PAGE_SIZE, so another name plus the trailing
		 *    newline still fits (presumably @buf is PAGE_SIZE bytes,
		 *    as is usual for sysfs - confirm).
		 */
		if (i >= (ssize_t) (PAGE_SIZE - (CPUIDLE_NAME_LEN + 2)))
			goto out;
		/* 3. Append the governor name followed by a space. */
		i += scnprintf(&buf[i], CPUIDLE_NAME_LEN + 1, "%s ", tmp->name);
	}

out:
	i+= sprintf(&buf[i], "\n");
	mutex_unlock(&cpuidle_lock);
	return i;
}

7.1.2 current_driver - 查看当前cpu正在使用的driver

show_current_driver实现如下，查看当前cpu正在使用的driver

/*
 * sysfs show handler for "current_driver": prints the name of the driver
 * returned by cpuidle_get_driver(), or "none" when there is no driver.
 * The lookup and the print happen under cpuidle_driver_lock.
 */
static ssize_t show_current_driver(
				   struct device *dev,
				   struct device_attribute *attr,
				   char *buf)
{
	ssize_t ret;
	struct cpuidle_driver *drv;

	spin_lock(&cpuidle_driver_lock);
	/* 1. Fetch the driver in use and print its name. */
	drv = cpuidle_get_driver();
	if (drv)
		ret = sprintf(buf, "%s\n", drv->name);
	else
		ret = sprintf(buf, "none\n");
	spin_unlock(&cpuidle_driver_lock);

	return ret;
}

7.1.3 current_governor - 查看/设置系统中正在使用的governor

show_current_governor查看系统中当前正在使用的governor

/*
 * sysfs show handler for "current_governor": prints the name of
 * cpuidle_curr_governor, or "none" when no governor is set.
 * The read happens under cpuidle_lock.
 */
static ssize_t show_current_governor(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	ssize_t ret;

	mutex_lock(&cpuidle_lock);
	/* 1. The governor in use is kept in the global cpuidle_curr_governor. */
	if (cpuidle_curr_governor)
		ret = sprintf(buf, "%s\n", cpuidle_curr_governor->name);
	else
		ret = sprintf(buf, "none\n");
	mutex_unlock(&cpuidle_lock);

	return ret;
}

store_current_governor实现如下，切换到指定的governor

/*
 * sysfs store handler for "current_governor": parses a governor name from
 * @buf and switches to the registered governor with that name.
 *
 * Returns @count on success, -EINVAL when no name could be parsed or no
 * governor matches, or the error returned by cpuidle_switch_governor().
 */
static ssize_t store_current_governor(struct device *dev,
				      struct device_attribute *attr,
				      const char *buf, size_t count)
{
	char gov_name[CPUIDLE_NAME_LEN + 1];
	int ret;
	struct cpuidle_governor *gov;

	/* Read at most CPUIDLE_NAME_LEN characters; the array has room for the NUL. */
	ret = sscanf(buf, "%" __stringify(CPUIDLE_NAME_LEN) "s", gov_name);
	if (ret != 1)
		return -EINVAL;

	mutex_lock(&cpuidle_lock);
	/* Stays -EINVAL if the loop finds no governor with a matching name. */
	ret = -EINVAL;
	/* Look up the governor the user asked for and switch to it. */
	list_for_each_entry(gov, &cpuidle_governors, governor_list) {
		if (!strncmp(gov->name, gov_name, CPUIDLE_NAME_LEN)) {
			ret = cpuidle_switch_governor(gov);
			break;
		}
	}
	mutex_unlock(&cpuidle_lock);

	return ret ? ret : count;
}

7.1.4 current_governor_ro - 显示当前正在使用的governor

show_current_governor实现如下，显示系统中正在使用的governor

/*
 * Show handler shared with "current_governor" (same function, listed again
 * for the read-only node): prints cpuidle_curr_governor's name or "none".
 */
static ssize_t show_current_governor(struct device *dev,
				     struct device_attribute *attr,
				     char *buf)
{
	ssize_t ret;

	mutex_lock(&cpuidle_lock);
	/* The governor in use is kept in the global cpuidle_curr_governor. */
	if (cpuidle_curr_governor)
		ret = sprintf(buf, "%s\n", cpuidle_curr_governor->name);
	else
		ret = sprintf(buf, "none\n");
	mutex_unlock(&cpuidle_lock);

	return ret;
}

7.2 /sys/devices/system/cpu/cpuN/cpuidle/stateX下的节点

记录cpu在各层C state中睡眠了多长时间，这个时间是从开机到现在时间

7.2.1 cpuidle_add_sysfs

这里仅仅是在/sys/devices/system/cpu/cpuN/下创建一个名为cpuidle的文件夹

/**

* cpuidle_add_sysfs - creates a sysfs instance for the target device

* @dev: the target device

int cpuidle_add_sysfs(struct cpuidle_device *dev)

{

struct cpuidle_device_kobj *kdev;

struct device *cpu_dev = get_cpu_device((unsigned long)dev->cpu);

int error;

* Return if cpu_device is not setup for this CPU.

* This could happen if the arch did not set up cpu_device

* since this CPU is not in cpu_present mask and the

* driver did not send a correct CPU mask during registration.

* Without this check we would end up passing bogus

* value for &cpu_dev->kobj in kobject_init_and_add()

if (!cpu_dev)

return -ENODEV;

kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);

if (!kdev)

return -ENOMEM;

kdev->dev = dev;

dev->kobj_dev = kdev;

init_completion(&kdev->kobj_unregister);

error = kobject_init_and_add(&kdev->kobj, &ktype_cpuidle, &cpu_dev->kobj,

"cpuidle");

if (error) {

kobject_put(&kdev->kobj);

return error;

}

kobject_uevent(&kdev->kobj, KOBJ_ADD);

return 0;

}

7.2.2 cpuidle_add_device_sysfs

/**
 * cpuidle_add_device_sysfs - adds device specific sysfs attributes
 * @device: the target device
 *
 * Returns 0 on success or the error of whichever step failed.
 */
int cpuidle_add_device_sysfs(struct cpuidle_device *device)
{
	int ret;

	/* 1. Create the per-state entries (cpuidle/stateX). */
	ret = cpuidle_add_state_sysfs(device);
	if (ret)
		return ret;

	/* 2. Create the cpuidle/driver entry; undo step 1 if that fails. */
	ret = cpuidle_add_driver_sysfs(device);
	if (ret)
		cpuidle_remove_state_sysfs(device);
	return ret;
}

7.2.3 cpuidle_add_state_sysfs

/**

* cpuidle_add_state_sysfs - adds cpuidle states sysfs attributes

* @device: the target device

static int cpuidle_add_state_sysfs(struct cpuidle_device *device)

{

int i, ret = -ENOMEM;

struct cpuidle_state_kobj *kobj;

struct cpuidle_device_kobj *kdev = device->kobj_dev;

struct cpuidle_driver *drv = cpuidle_get_cpu_driver(device);

/* state statistics */

for (i = 0; i < drv->state_count; i++) {

kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);

if (!kobj) {

ret = -ENOMEM;

goto error_state;

}

kobj->state = &drv->states[i];

kobj->state_usage = &device->states_usage[i];

kobj->device = device;

init_completion(&kobj->kobj_unregister);

//创建stateN路径

ret = kobject_init_and_add(&kobj->kobj, &ktype_state_cpuidle,

&kdev->kobj, "state%d", i);

if (ret) {

kobject_put(&kobj->kobj);

goto error_state;

}

//在路径中创建节点

cpuidle_add_s2idle_attr_group(kobj);

kobject_uevent(&kobj->kobj, KOBJ_ADD);

device->kobjs[i] = kobj;

}

return 0;

error_state:

for (i = i - 1; i >= 0; i--)

cpuidle_free_state_kobj(device, i);

return ret;

}

其中ktype_state_cpuidle实现如下：

/*
 * kobj_type used for every "stateN" kobject: wires up the show/store
 * dispatch, the default attribute list and the release callback.
 */
static struct kobj_type ktype_state_cpuidle = {
	.sysfs_ops = &cpuidle_state_sysfs_ops,
	.default_attrs = cpuidle_state_default_attrs,
	.release = cpuidle_state_sysfs_release,
};

节点如下：

/*
 * Per-state attributes. Each line names the attribute and the handler(s)
 * serving it; "disable" is the only one that also has a store handler.
 */
define_one_state_ro(name, show_state_name);
define_one_state_ro(desc, show_state_desc);
define_one_state_ro(latency, show_state_exit_latency);
define_one_state_ro(residency, show_state_target_residency);
define_one_state_ro(power, show_state_power_usage);
define_one_state_ro(usage, show_state_usage);
define_one_state_ro(rejected, show_state_rejected);
define_one_state_ro(time, show_state_time);
define_one_state_rw(disable, show_state_disable, store_state_disable);
define_one_state_ro(above, show_state_above);
define_one_state_ro(below, show_state_below);
define_one_state_ro(default_status, show_state_default_status);

/* NULL-terminated list referenced by ktype_state_cpuidle.default_attrs. */
static struct attribute *cpuidle_state_default_attrs[] = {
	&attr_name.attr,
	&attr_desc.attr,
	&attr_latency.attr,
	&attr_residency.attr,
	&attr_power.attr,
	&attr_usage.attr,
	&attr_rejected.attr,
	&attr_time.attr,
	&attr_disable.attr,
	&attr_above.attr,
	&attr_below.attr,
	&attr_default_status.attr,
	NULL
};

7.2.3.1 above - 返回cpuidle_state_usage->above，即进入该C state后实际滞留时间不足（本应选择更浅的C state）的次数

define_show_state_ull_function(above)

define_one_state_ro(above, show_state_above);

其中define_show_state_ull_function实现如下：

/*
 * Generates show_state_<_name>(), which prints the unsigned long long
 * counter state_usage-><_name> followed by a newline.
 */
#define define_show_state_ull_function(_name) \
static ssize_t show_state_##_name(struct cpuidle_state *state, \
				  struct cpuidle_state_usage *state_usage, \
				  char *buf) \
{ \
	return sprintf(buf, "%llu\n", state_usage->_name);\
}

7.2.3.2 below - 返回cpuidle_state_usage->below，即进入该C state后发现本可以选择更深的C state的次数

define_one_state_ro(below, show_state_below);

define_show_state_ull_function(below)

7.2.3.3 name - 返回cpuidle_state->name，C state的名称

define_show_state_str_function(name)

define_one_state_ro(name, show_state_name);

/*
 * Generates show_state_<_name>(), which prints the string state-><_name>
 * followed by a newline, or "<null>" when the string is empty.
 */
#define define_show_state_str_function(_name) \
static ssize_t show_state_##_name(struct cpuidle_state *state, \
				  struct cpuidle_state_usage *state_usage, \
				  char *buf) \
{ \
	if (state->_name[0] == '\0')\
		return sprintf(buf, "<null>\n");\
	return sprintf(buf, "%s\n", state->_name);\
}

7.2.3.4 desc - 返回cpuidle_state->desc，C state的描述信息

define_show_state_str_function(desc)

define_one_state_ro(desc, show_state_desc);

7.2.3.5 disable - 返回cpuidle_state_usage->disable，这个C state是否被禁用

define_one_state_rw(disable, show_state_disable, store_state_disable);

读操作如下：

/*
 * Show handler for "disable": prints only the CPUIDLE_STATE_DISABLED_BY_USER
 * bit of state_usage->disable, so any other bits in that field are not
 * visible through this node.
 */
static ssize_t show_state_disable(struct cpuidle_state *state,
				  struct cpuidle_state_usage *state_usage,
				  char *buf)
{
	return sprintf(buf, "%llu\n",
		       state_usage->disable & CPUIDLE_STATE_DISABLED_BY_USER);
}

写操作如下：

/*
 * Store handler for "disable": a non-zero value sets the
 * CPUIDLE_STATE_DISABLED_BY_USER bit in state_usage->disable, zero clears it.
 *
 * Returns @size on success, -EPERM without CAP_SYS_ADMIN, or the
 * kstrtouint() error when @buf is not a valid unsigned integer.
 */
static ssize_t store_state_disable(struct cpuidle_state *state,
				   struct cpuidle_state_usage *state_usage,
				   const char *buf, size_t size)
{
	unsigned int value;
	int err;

	/* Permission check. */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	err = kstrtouint(buf, 0, &value);
	if (err)
		return err;

	/* Only the "disabled by user" bit is changed; other bits are kept. */
	if (value)
		state_usage->disable |= CPUIDLE_STATE_DISABLED_BY_USER;
	else
		state_usage->disable &= ~CPUIDLE_STATE_DISABLED_BY_USER;

	return size;
}

7.2.3.6 latency - 返回cpuidle_state->exit_latency_ns，该C state的退出延迟，单位us

define_show_state_time_function(exit_latency)

define_one_state_ro(latency, show_state_exit_latency);

读操作实现如下：

/*
 * Generates show_state_<_name>(), which reads state-><_name>_ns and prints
 * it after conversion with ktime_to_us(), i.e. in microseconds.
 */
#define define_show_state_time_function(_name) \
static ssize_t show_state_##_name(struct cpuidle_state *state, \
				  struct cpuidle_state_usage *state_usage, \
				  char *buf) \
{ \
	return sprintf(buf, "%llu\n", ktime_to_us(state->_name##_ns)); \
}

7.2.3.7 residency - 返回cpuidle_state->target_residency_ns，该C state的"最小滞留时间"，单位us

define_show_state_time_function(target_residency)

define_one_state_ro(residency, show_state_target_residency);

7.2.3.8 power - 返回cpuidle_state->power_usage，这个cpu在C state下的功耗，单位mW

define_one_state_ro(power, show_state_power_usage);

define_show_state_function(power_usage)

读操作实现如下：

/*
 * Generates show_state_<_name>(), which prints state-><_name> with "%u"
 * followed by a newline.
 */
#define define_show_state_function(_name) \
static ssize_t show_state_##_name(struct cpuidle_state *state, \
				  struct cpuidle_state_usage *state_usage, char *buf) \
{ \
	return sprintf(buf, "%u\n", state->_name);\
}

7.2.3.9 time - 返回cpuidle_state_usage->time_ns，这个cpu在这个C state下的总耗时，内部以ns记录，读出时转换为us

define_one_state_ro(time, show_state_time);

读操作实现如下：

/*
 * Show handler for "time": prints state_usage->time_ns converted with
 * ktime_to_us(), so the value read from sysfs is in microseconds.
 */
static ssize_t show_state_time(struct cpuidle_state *state,
			       struct cpuidle_state_usage *state_usage,
			       char *buf)
{
	return sprintf(buf, "%llu\n", ktime_to_us(state_usage->time_ns));
}

7.2.3.10 usage - 返回cpuidle_state_usage->usage，这个cpu进入该C state的次数

define_one_state_ro(usage, show_state_usage);

define_show_state_ull_function(usage)

7.3 /sys/devices/system/cpu/cpuN/cpuidle/driver下的节点

7.3.1 cpuidle_add_driver_sysfs

/**

* cpuidle_add_driver_sysfs - adds the driver name sysfs attribute

* @dev: the target device

static int cpuidle_add_driver_sysfs(struct cpuidle_device *dev)

{

struct cpuidle_driver_kobj *kdrv;

struct cpuidle_device_kobj *kdev = dev->kobj_dev;

struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);

int ret;

kdrv = kzalloc(sizeof(*kdrv), GFP_KERNEL);

if (!kdrv)

return -ENOMEM;

kdrv->drv = drv;

init_completion(&kdrv->kobj_unregister);

//创建driver目录

ret = kobject_init_and_add(&kdrv->kobj, &ktype_driver_cpuidle,

&kdev->kobj, "driver");

if (ret) {

kobject_put(&kdrv->kobj);

return ret;

}

kobject_uevent(&kdrv->kobj, KOBJ_ADD);

dev->kobj_driver = kdrv;

return ret;

}

其中ktype_driver_cpuidle实现如下：

/*
 * kobj_type used for the "driver" kobject: wires up the show/store
 * dispatch, the default attribute list and the release callback.
 */
static struct kobj_type ktype_driver_cpuidle = {
	.sysfs_ops = &cpuidle_driver_sysfs_ops,
	.default_attrs = cpuidle_driver_default_attrs,
	.release = cpuidle_driver_sysfs_release,
};

在driver下的name节点

/* NULL-terminated attribute list for the "driver" kobject: a single entry. */
static struct attribute *cpuidle_driver_default_attrs[] = {
	&attr_driver_name.attr,
	NULL
};

7.3.1.1 name - 返回，当前cpu正在使用的驱动

define_one_driver_ro(name, show_driver_name);

实现如下

/*
 * Show handler for driver/name: prints @drv->name, or "none" when @drv is
 * NULL. The print happens under cpuidle_driver_lock.
 */
static ssize_t show_driver_name(struct cpuidle_driver *drv, char *buf)
{
	ssize_t ret;

	spin_lock(&cpuidle_driver_lock);
	ret = sprintf(buf, "%s\n", drv ? drv->name : "none");
	spin_unlock(&cpuidle_driver_lock);

	return ret;
}

八、cpuidle_device的使能和禁用

在一个cpu被使能或者禁用，主要执行下面步骤

在sysfs文件系统中创建节点
调用governor的enable/disable回调函数，完成governor私有数据的初始化和销毁工作
更新全局变量enabled_devices，该变量记录着系统中有多少cpu受cpuidle子系统控制

8.1 cpuidle_enable_device - 使能一个cpu

/**

* cpuidle_enable_device - enables idle PM for a CPU

* @dev: the CPU

* This function must be called between cpuidle_pause_and_lock and

* cpuidle_resume_and_unlock when used externally.

int cpuidle_enable_device(struct cpuidle_device *dev)

{

int ret;

struct cpuidle_driver *drv;

if (!dev)

return -EINVAL;

//1.已经使能则退出

if (dev->enabled)

return 0;

//2.未指定governor则退出

if (!cpuidle_curr_governor)

return -EIO;

//3.获取device对应的driver

drv = cpuidle_get_cpu_driver(dev);

if (!drv)

return -EIO;

if (!dev->registered)

return -EINVAL;

//4.在sysfs文件系统中创建节点

// 创建/sys/devices/system/cpu/cpuN/cpuidle/stateX下的节点

ret = cpuidle_add_device_sysfs(dev);

if (ret)

return ret;

//5.调用governor的enable回调函数，一般完成私有数据的初始化

if (cpuidle_curr_governor->enable) {

ret = cpuidle_curr_governor->enable(drv, dev);

if (ret)

goto fail_sysfs;

}

smp_wmb();

dev->enabled = 1;

//6.全局变量，记录系统中共有多少个cpu受cpuidle子系统管控

enabled_devices++;

return 0;

fail_sysfs:

cpuidle_remove_device_sysfs(dev);

return ret;

}

8.2 cpuidle_disable_device - 禁用一个cpu

与使能相反，禁用时会调用governor->disable回调函数，一般用于完成governor私有数据的销毁

/**

* cpuidle_disable_device - disables idle PM for a CPU

* @dev: the CPU

* This function must be called between cpuidle_pause_and_lock and

* cpuidle_resume_and_unlock when used externally.

void cpuidle_disable_device(struct cpuidle_device *dev)

{

struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);

//1.已经被disable，则退出

if (!dev || !dev->enabled)

return;

if (!drv || !cpuidle_curr_governor)

return;

dev->enabled = 0;

//2.完成gonvernor私有数据的销毁工作，例如申请的内存的是释放等

if (cpuidle_curr_governor->disable)

cpuidle_curr_governor->disable(drv, dev);

//3.移除sysfs中的节点信息

// 移除/sys/devices/system/cpu/cpuN/cpuidle/stateX下的节点

cpuidle_remove_device_sysfs(dev);

//4.全局变量，记录系统中共有多少个cpu受cpuidle子系统管控

enabled_devices--;

}

九、cpuidle子系统的使能和禁止

9.1 cpuidle_pause - 停用cpuidle子系统

/* Currently used in suspend/resume path to suspend cpuidle */
/* Runs cpuidle_uninstall_idle_handler() with cpuidle_lock held. */
void cpuidle_pause(void)
{
	mutex_lock(&cpuidle_lock);
	cpuidle_uninstall_idle_handler();
	mutex_unlock(&cpuidle_lock);
}

9.2 cpuidle_uninstall_idle_handler

/**

* cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler

void cpuidle_uninstall_idle_handler(void)

{

//如果enabled_devices不为0，则说明cpuidle子系统此时正控制着某些cpu，

//当然也可能存在一些cpu已经被cpuidle子系统搞睡眠了，因为cpuidle子系

//统马上就要狗带了，在狗带之前就需要唤醒所有已经idle的cpu

if (enabled_devices) {

initialized = 0;

wake_up_all_idle_cpus();

}

* Make sure external observers (such as the scheduler)

* are done looking at pointed idle states.

synchronize_rcu();

}

9.2.1 wake_up_all_idle_cpus - 唤醒所有idle的cpu

/**

* wake_up_all_idle_cpus - break all cpus out of idle

* wake_up_all_idle_cpus try to break all cpus which is in idle state even

* including idle polling cpus, for non-idle cpus, we will do nothing

* for them.

void wake_up_all_idle_cpus(void)

{

int cpu;

preempt_disable();

//1.遍历所有online的cpu

for_each_online_cpu(cpu) {

//2.当前cpu肯定不是idle的，跳过

if (cpu == smp_processor_id())

continue;

//3.执行唤醒动作

wake_up_if_idle(cpu);

}

preempt_enable();

}

9.2.2 wake_up_if_idle - 如果cpu处于idle状态，就唤醒它

/*
 * Wake @cpu if its runqueue is currently running the idle task; do nothing
 * otherwise.
 */
void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	rcu_read_lock();

	/* 1. A CPU counts as idle when the task it is running is an idle task. */
	if (!is_idle_task(rcu_dereference(rq->curr)))
		goto out;

	/* 2. Wake the CPU. */
	if (set_nr_if_polling(rq->idle)) {
		/*
		 * 2.1 The idle task was polling (busy-waiting) rather than
		 *     truly asleep, and set_nr_if_polling() reported that
		 *     _TIF_NEED_RESCHED is now set on it, so no IPI is sent
		 *     here; only the trace event is emitted.
		 */
		trace_sched_wake_idle_without_ipi(cpu);
	} else {
		rq_lock_irqsave(rq, &rf);
		/*
		 * 2.2 Not polling: re-check under the rq lock that the idle
		 *     task is still current, then send a reschedule IPI.
		 */
		if (is_idle_task(rq->curr))
			smp_send_reschedule(cpu);
		/* Else CPU is not idle, do nothing here: */
		rq_unlock_irqrestore(rq, &rf);
	}

out:
	rcu_read_unlock();
}

9.2.1.1 is_idle_task - 判断一个线程是不是idle线程

/**

* is_idle_task - is the specified task an idle task?

* @p: the task in question.

* Return: 1 if @p is an idle task. 0 otherwise.

static __always_inline bool is_idle_task(const struct task_struct *p)

{

return !!(p->flags & PF_IDLE);

}

9.2.1.2 set_nr_if_polling - 对指定的task设置_TIF_NEED_RESCHED标志

* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.

* If this returns true, then the idle task promises to call

* sched_ttwu_pending() and reschedule soon.

static bool set_nr_if_polling(struct task_struct *p)

{

//1.传入的p一般就是idle线程，下面从thread_info中提取flags

struct thread_info *ti = task_thread_info(p);

typeof(ti->flags) old, val = READ_ONCE(ti->flags);

for (;;) {

//2.首先判断这个线程是不是处于polling状态

if (!(val & _TIF_POLLING_NRFLAG))

return false;

//3.如果_TIF_NEED_RESCHED标记已经被设置了，就不需要重复设置了

if (val & _TIF_NEED_RESCHED)

return true;

//4.设置_TIF_NEED_RESCHED标志，设置成功后break跳出循环

old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);

if (old == val)

break;

val = old;

}

return true;

}

9.2.1.3 smp_send_reschedule - 平台相关，唤醒指定cpu

arm和arm64实现都是如下：

/* Send the IPI_RESCHEDULE inter-processor interrupt to @cpu only. */
void smp_send_reschedule(int cpu)
{
	/* The target mask holds just @cpu. */
	smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
}

9.3 cpuidle_resume - 重新启用cpuidle子系统

由下面的注释可知，该函数是在suspend/resume路径上被调用，也就是先执行上面的cpuidle_pause，然后再执行这里的cpuidle_resume，具体使用的地方我们暂时不深入分析

/* Currently used in suspend/resume path to resume cpuidle */
/* Runs cpuidle_install_idle_handler() with cpuidle_lock held. */
void cpuidle_resume(void)
{
	mutex_lock(&cpuidle_lock);
	cpuidle_install_idle_handler();
	mutex_unlock(&cpuidle_lock);
}

9.4 cpuidle_install_idle_handler

/**

* cpuidle_install_idle_handler - installs the cpuidle idle loop handler

void cpuidle_install_idle_handler(void)

{

if (enabled_devices) {

/* Make sure all changes finished before we switch to new idle */

smp_wmb();

initialized = 1;

}

十、poll机制（待补充）

涉及的文件：U:\linux-5.10.61\drivers\cpuidle\poll_state.c

poll机制是指使用cpu_relax（在x86上对应PAUSE指令）来完成state0的忙等，目前只有x86支持，打开CONFIG_ARCH_HAS_CPU_RELAX宏的时候才会编译上面的文件

poll idle又称作CPU relax，是一种相对标准的idle state，在诸如64位Power PC等体系结构上，会提供这种state。如定义了CONFIG_ARCH_HAS_CPU_RELAX，cpuidle会在注册cpuidle driver时，自动将driver的state[0]注册为POLL idle state

10.1 cpuidle_poll_state_init

奇怪，arm64中state0不是normal 模式吗，难道在x86中对state0重新定义了

文件位置：U:\linux-5.10.61\drivers\cpuidle\cpuidle.c

/*
 * Fill in state 0 of @drv as the "POLL" state: zero exit latency and
 * target residency, power_usage of -1, poll_idle as the ->enter callback
 * and the CPUIDLE_FLAG_POLLING flag.
 */
void cpuidle_poll_state_init(struct cpuidle_driver *drv)
{
	/* 1. The state being configured is always index 0. */
	struct cpuidle_state *state = &drv->states[0];

	snprintf(state->name, CPUIDLE_NAME_LEN, "POLL");
	snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE");
	state->exit_latency = 0;
	state->target_residency = 0;
	state->exit_latency_ns = 0;
	state->target_residency_ns = 0;
	state->power_usage = -1;
	/*
	 * 2. ->enter is the callback used to ENTER this state (it is not
	 *    run on exit from idle), so poll_idle runs whenever state 0 is
	 *    the state being entered.
	 *    CPUIDLE_FLAG_POLLING marks the state as a busy-wait loop
	 *    rather than a real idle state.
	 */
	state->enter = poll_idle;
	state->flags = CPUIDLE_FLAG_POLLING;
}

关于CPUIDLE_FLAG_POLLING，在内核文档中有如下解释，表示cpu是进入忙等，而不是真真意义上的idle状态

U:\linux-5.10.61\Documentation\driver-api\pm\cpuidle.rst

:c:member:`flags`

Flags representing idle state properties. Currently, governors only use

the ``CPUIDLE_FLAG_POLLING`` flag which is set if the given object

does not represent a real idle state, but an interface to a software

"loop" that can be used in order to avoid asking the processor to enter

any idle state at all. [There are other flags used by the ``CPUIdle``

core in special situations.]

10.2 poll_idle - polling模式的C state的进入方式

/*
 * ->enter callback of the polling state: busy-waits with cpu_relax() until
 * a reschedule is needed or the poll time limit is exceeded.
 *
 * @dev:   cpuidle device of this CPU; dev->poll_time_limit is set to true
 *         when the loop ended because the limit was exceeded.
 * @drv:   cpuidle driver, passed on to cpuidle_poll_time().
 * @index: index of the state being entered; returned unchanged.
 */
static int __cpuidle poll_idle(struct cpuidle_device *dev,
			       struct cpuidle_driver *drv,
			       int index)
{
	u64 time_start = local_clock();

	dev->poll_time_limit = false;

	local_irq_enable();
	/*
	 * 1. NOTE(review): by its name this marks the current task as
	 *    polling and tests whether a reschedule is already pending;
	 *    the loop is entered only when it returns false - confirm.
	 */
	if (!current_set_polling_and_test()) {
		unsigned int loop_count = 0;
		u64 limit;

		/* 2. Upper bound on how long to keep polling. */
		limit = cpuidle_poll_time(drv, dev);

		/* 3. Keep relaxing for as long as no reschedule is needed. */
		while (!need_resched()) {
			cpu_relax();
			/* 4. Only look at the clock every POLL_IDLE_RELAX_COUNT iterations. */
			if (loop_count++ < POLL_IDLE_RELAX_COUNT)
				continue;

			loop_count = 0;
			/* 5. Give up polling once the time limit has been exceeded. */
			if (local_clock() - time_start > limit) {
				dev->poll_time_limit = true;
				break;
			}
		}
	}
	/* 6. NOTE(review): presumably clears the polling mark set above - confirm. */
	current_clr_polling();

	/* 7. Report the requested index as the state that was entered. */
	return index;
}

10.3 cpuidle_poll_time - 所有C state中的"最小滞留时间"的最小值

/**

* cpuidle_poll_time - return amount of time to poll for,

* governors can override dev->poll_limit_ns if necessary

* @drv: the cpuidle driver tied with the cpu

* @dev: the cpuidle device

u64 cpuidle_poll_time(

struct cpuidle_driver *drv,

struct cpuidle_device *dev)

{

int i;

u64 limit_ns;

if (dev->poll_limit_ns)

return dev->poll_limit_ns;

limit_ns = TICK_NSEC;

//1.找出一个睡眠最浅的C state对应的"最小滞留时间"

// 注意：这里是从1开始遍历，（因为0是normal state），也就是从睡眠最浅

// 的那一级开始遍历，睡眠越深，"最小滞留时间"就越大，所以下面for

// 循环中，只要找到一个avaliable的C state就退出

for (i = 1; i < drv->state_count; i++) {

if (dev->states_usage[i].disable)

continue;

limit_ns = drv->states[i].target_residency_ns;

break;

}

//2.记录这个cpu支持的所有C state中，"最小滞留时间"的最小值

// 也就是睡眠最浅的那个C state的"最小滞留时间"

dev->poll_limit_ns = limit_ns;

return dev->poll_limit_ns;

}

十一、coupled机制（待补充）

耦合的cpuidle state

在某些SoC芯片中，基于下面两个原因，导致不能对某个cpu执行单独的上下电操作：

cpu的下电顺序有严格要求，例如Tegra2中，cpu0必须最后一个断电
芯片在设计阶段引入的bug，例如OMAP4460中，某个cpu上电时会破坏gic的状态，除非另一个cpu执行相应的规避措施（workaround）

每个cpu都有一个独立的电源状态和多个"耦合电源状态"，独立的电源状态可以在不与其他cpu协调的情况下进入（例如通过WFI指令），而"耦合电源状态"的进入和退出会影响多个cpu之间的"耦合单元"（例如L2 Cache，GIC中断控制器，有时甚至会影响整个SoC），所有在进入和退出这些"耦合电源状态"时，必须受到这些耦合cpu的共同控制

coupled机制就是提供了一种解决方案：先将每个cpu将进入WFI状态中等待，直到所有耦合cpu都准备好进入耦合状态后，所有cpu再在同一时间回调函数，一起进入这个"耦合电源状态"

一旦所有的cpu都准备好进入idle状态，它们就会被一个smp交叉调用唤醒。此时仍有可能某个cpu发现有事可做，从而选择不进入idle状态。为了保证所有cpu同时调用power state enter回调函数，需要再进行最后一轮确认。在此过程中，每个cpu将增加ready count计数器，并在ready计数器等于在线的耦合cpu数量之后才继续往下执行。这期间如果有任何cpu退出idle，其他cpu将减少其计数器并重试

requested_state存储每个CPU准备的最深的"耦合状态"，假设状态索引从最浅（最高功耗，最低退出延迟）到最深（最低功耗，最高退出延迟）。requested_state变量没有被锁定，它只从它存储的cpu中写入（或者如果cpu离线，则由处于在线/离线状态的cpu写入），并且只有在所有cpu都为耦合的空闲状态准备好之后才会读取

该机制使用了三个原子计数器：

alive_count用于跟踪这个"耦合集"中当前或即将上线的cpu数量
waiting_count用于跟踪处于waiting loop、ready loop或coupled idle state的cpu数量
ready_count用于跟踪处于ready loop或coupled idle state的cpu数量

要使用coupled cpuidle states，一个cpuidle驱动必须提供如下支持：

在cpuidle_device->coupled_cpus记录所有存在耦合关系的cpu；
将cpuidle_device->safe_state设置为一个非耦合的状态（通常是WFI）
在影响多个cpu的cpuidle_state的flags中标记CPUIDLE_FLAG_COUPLED
为每个影响多个cpu的state提供一个cpuidle_state->enter回调函数，core层保证该函数会在所有cpu上几乎同时被调用；如果任何一个cpu试图在函数被调用后中止，驱动程序应该确保所有的cpu一起中止，函数返回时仍应保持中断禁用

下面注释来源于U:\linux-5.10.61\drivers\cpuidle\coupled.c

/**

* DOC: Coupled cpuidle states

* On some ARM SMP SoCs (OMAP4460, Tegra 2, and probably more), the

* cpus cannot be independently powered down, either due to

* sequencing restrictions (on Tegra 2, cpu 0 must be the last to

* power down), or due to HW bugs (on OMAP4460, a cpu powering up

* will corrupt the gic state unless the other cpu runs a work

* around). Each cpu has a power state that it can enter without

* coordinating with the other cpu (usually Wait For Interrupt, or

* WFI), and one or more "coupled" power states that affect blocks

* shared between the cpus (L2 cache, interrupt controller, and

* sometimes the whole SoC). Entering a coupled power state must

* be tightly controlled on both cpus.

* This file implements a solution, where each cpu will wait in the

* WFI state until all cpus are ready to enter a coupled state, at

* which point the coupled state function will be called on all

* cpus at approximately the same time.

* Once all cpus are ready to enter idle, they are woken by an smp

* cross call. At this point, there is a chance that one of the

* cpus will find work to do, and choose not to enter idle. A

* final pass is needed to guarantee that all cpus will call the

* power state enter function at the same time. During this pass,

* each cpu will increment the ready counter, and continue once the

* ready counter matches the number of online coupled cpus. If any

* cpu exits idle, the other cpus will decrement their counter and

* retry.

* requested_state stores the deepest coupled idle state each cpu

* is ready for. It is assumed that the states are indexed from

* shallowest (highest power, lowest exit latency) to deepest

* (lowest power, highest exit latency). The requested_state

* variable is not locked. It is only written from the cpu that

* it stores (or by the on/offlining cpu if that cpu is offline),

* and only read after all the cpus are ready for the coupled idle

* state are are no longer updating it.

* Three atomic counters are used. alive_count tracks the number

* of cpus in the coupled set that are currently or soon will be

* online. waiting_count tracks the number of cpus that are in

* the waiting loop, in the ready loop, or in the coupled idle state.

* ready_count tracks the number of cpus that are in the ready loop

* or in the coupled idle state.

* To use coupled cpuidle states, a cpuidle driver must:

* Set struct cpuidle_device.coupled_cpus to the mask of all

* coupled cpus, usually the same as cpu_possible_mask if all cpus

* are part of the same cluster. The coupled_cpus mask must be

* set in the struct cpuidle_device for each cpu.

* Set struct cpuidle_device.safe_state to a state that is not a

* coupled state. This is usually WFI.

* Set CPUIDLE_FLAG_COUPLED in struct cpuidle_state.flags for each

* state that affects multiple cpus.

* Provide a struct cpuidle_state.enter function for each state

* that affects multiple cpus. This function is guaranteed to be

* called on all cpus at approximately the same time. The driver

* should ensure that the cpus all abort together if any cpu tries

* to abort once the function is called. The function should return

* with interrupts still disabled.

*/

11.1 cpuidle_coupled_register_device

/**

* cpuidle_coupled_register_device - register a coupled cpuidle device

* @dev: struct cpuidle_device for the current cpu

* Called from cpuidle_register_device to handle coupled idle init. Finds the

* cpuidle_coupled struct for this set of coupled cpus, or creates one if none

* exists yet.

int cpuidle_coupled_register_device(struct cpuidle_device *dev)

{

int cpu;

struct cpuidle_device *other_dev;

call_single_data_t *csd;

struct cpuidle_coupled *coupled;

//1.dev->coupled_cpus难道是记录着有哪些cpu共享该结构吗

if (cpumask_empty(&dev->coupled_cpus))

return 0;

//2.遍历所有coupled_cpus

for_each_cpu(cpu, &dev->coupled_cpus) {

other_dev = per_cpu(cpuidle_devices, cpu);

if (other_dev && other_dev->coupled) {

coupled = other_dev->coupled;

goto have_coupled;

}

/* No existing coupled info found, create a new one */

//初始化

coupled = kzalloc(sizeof(struct cpuidle_coupled), GFP_KERNEL);

if (!coupled)

return -ENOMEM;

coupled->coupled_cpus = dev->coupled_cpus;

have_coupled:

dev->coupled = coupled;

if (WARN_ON(!cpumask_equal(&dev->coupled_cpus, &coupled->coupled_cpus)))

coupled->prevent++;

cpuidle_coupled_update_online_cpus(coupled);

coupled->refcnt++;

csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);

csd->func = cpuidle_coupled_handle_poke;

csd->info = (void *)(unsigned long)dev->cpu;

return 0;

}

11.2 cpuidle_coupled_update_online_cpus

/*
 * Recompute @coupled->online_count as the number of cpus that are both in
 * cpu_online_mask and in @coupled->coupled_cpus.
 */
static void cpuidle_coupled_update_online_cpus(struct cpuidle_coupled *coupled)
{
	cpumask_t cpus;
	cpumask_and(&cpus, cpu_online_mask, &coupled->coupled_cpus);
	coupled->online_count = cpumask_weight(&cpus);
}

11.3 cpuidle_coupled_handle_poke

/*
 * Poke callback: @info carries the cpu number. Marks that cpu in
 * cpuidle_coupled_poked and clears it from cpuidle_coupled_poke_pending.
 */
static void cpuidle_coupled_handle_poke(void *info)
{
	int cpu = (unsigned long)info;
	cpumask_set_cpu(cpu, &cpuidle_coupled_poked);
	cpumask_clear_cpu(cpu, &cpuidle_coupled_poke_pending);
}

11.4 cpuidle_state_is_coupled - 判断这个C state是否在多个cpu上起作用

判断这个C state是否被多个cpu共用

/**

* cpuidle_state_is_coupled - check if a state is part of a coupled set

* @drv: struct cpuidle_driver for the platform

* @state: index of the target state in drv->states

* Returns true if the target state is coupled with cpus besides this one

bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)

{

return drv->states[state].flags & CPUIDLE_FLAG_COUPLED;

}

关注公众号不迷路：DumpStack

扫码加关注

本作品采用知识共享署名-非商业性使用 4.0 国际许可协议进行许可