Linux电源管理——PSCI初始化流程和多核启动流程

一、PSCI 初始化流程

1、PSCI设备树节点

2、PSCI kernel初始化流程

get_set_conduit_method

set_conduit

psci_probe

二、CPU PSCI 操作初始化流程

1、CPU 设备树节点

2、 struct cpu_operations

3、kernel 流程

cpu_read_bootcpu_ops

smp_init_cpus

三、CPU PSCI多核启动流程

1、boot cpu 启动流程

2、secondary CPU 启动流程

QEMU Version：qemu-7.2.0

Linux Version：linux-5.4.239

本文主要分析了在ARM64架构中的PSCI电源管理接口在Linux内核中的实现流程，并分析了linux系统中如何通过 PSCI 接口启动 CPUs。

一、PSCI 初始化流程

PSCI(Power State Coordination Interface)，是由ARM定义的电源管理接口规范，Linux系统可以通过smc/hvc指令陷入到更高的Exception Level（smc 陷入 EL3，hvc 陷入 EL2），进而调用对应的实现函数，下面将对 PSCI 设备树和 linux kernel 源码中 PSCI 的初始化流程进行简单分析。

1、PSCI设备树节点

psci {migrate = <0xc4000005>;cpu_on = <0xc4000003>;cpu_off = <0x84000002>;cpu_suspend = <0xc4000001>;method = "hvc";compatible = "arm,psci-1.0\0arm,psci-0.2\0arm,psci";
};

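migrate：定义 MIGRATE 操作对应的 PSCI 函数 ID（function ID，而不是函数入口地址），该操作用于把 Trusted OS 迁移到指定 CPU。
cpu_on：定义启动 CPU（CPU_ON）对应的 PSCI 函数 ID。
cpu_off：定义关闭 CPU（CPU_OFF）对应的 PSCI 函数 ID。
cpu_suspend：定义 CPU 挂起操作（CPU_SUSPEND）对应的 PSCI 函数 ID，用于将 CPU 置于低功耗状态。
注：这些函数 ID 属性只在 PSCI v0.1（compatible 为 arm,psci）时才会被内核读取；PSCI v0.2 及以上版本使用规范定义的标准函数 ID（见后文 psci_0_2_set_functions）。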
method：指定调用 PSCI 函数的方式，hvc 表示通过 Hypervisor Call 指令触发 PSCI 操作；另一种取值是 smc（Secure Monitor Call），用于陷入 ATF。
compatible：声明设备树节点兼容的 PSCI 规范版本，按从新到旧的顺序列出，内核优先匹配列表中最靠前（最具体）的版本，即 arm,psci-1.0，旧版本字符串用于向后兼容。

2、PSCI kernel初始化流程

从 start_kernel 函数开始分析，如下：

start_kernel                           init/main.c

        setup_arch                    arch/arm64/kernel/setup.c

                setup_machine_fdt(__fdt_pointer)

                psci_dt_init

在 setup_arch 函数中首先会调用 setup_machine_fdt 函数解析设备树，其中参数__fdt_pointer是从 arch/arm64/kernel/head.S 文件中传过来的，如下：

/** The following fragment of code is executed with the MMU enabled.**   x0 = __PHYS_OFFSET*/
__primary_switched:adrp	x4, init_thread_unionadd	sp, x4, #THREAD_SIZEadr_l	x5, init_taskmsr	sp_el0, x5			// Save thread_infoadr_l	x8, vectors			// load VBAR_EL1 with virtualmsr	vbar_el1, x8			// vector table addressisbstp	xzr, x30, [sp, #-16]!mov	x29, spstr_l	x21, __fdt_pointer, x5		// Save FDT pointerldr_l	x4, kimage_vaddr		// Save the offset betweensub	x4, x4, x0			// the kernel virtual andstr_l	x4, kimage_voffset, x5		// physical mappings// Clear BSSadr_l	x0, __bss_startmov	x1, xzradr_l	x2, __bss_stopsub	x2, x2, x0bl	__pi_memsetdsb	ishst				// Make zero page visible to PTW#ifdef CONFIG_KASANbl	kasan_early_init
#endif
#ifdef CONFIG_RANDOMIZE_BASEtst	x23, ~(MIN_KIMG_ALIGN - 1)	// already running randomized?b.ne	0fmov	x0, x21				// pass FDT address in x0bl	kaslr_early_init		// parse FDT for KASLR optionscbz	x0, 0f				// KASLR disabled? just proceedorr	x23, x23, x0			// record KASLR offsetldp	x29, x30, [sp], #16		// we must enable KASLR, returnret					// to __primary_switch()
0:
#endifadd	sp, sp, #16mov	x29, #0mov	x30, #0b	start_kernel
ENDPROC(__primary_switched)

在解析完设备树之后会调用 psci_dt_init 函数初始化 PSCI，如下：

// drivers/firmware/psci/psci.ctypedef int (*psci_initcall_t)(const struct device_node *);static const struct of_device_id psci_of_match[] __initconst = {{ .compatible = "arm,psci",	.data = psci_0_1_init},{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init},{ .compatible = "arm,psci-1.0",	.data = psci_1_0_init},{},
};int __init psci_dt_init(void)
{struct device_node *np;const struct of_device_id *matched_np;psci_initcall_t init_fn;int ret;np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);if (!np || !of_device_is_available(np))return -ENODEV;init_fn = (psci_initcall_t)matched_np->data;ret = init_fn(np);of_node_put(np);return ret;
}

of_find_matching_node_and_match 函数用于在设备树中查找与指定 psci_of_match 匹配表兼容的节点并初始化matched_np，并且这里会优先使用“arm,psci-1.0” 字段进行匹配并向后兼容。

在初始化完 matched_np 之后再初始化 init_fn 为 matched_np->data，也即是 init_fn = psci_1_0_init（compatible = "arm,psci-1.0"）。

最后调用 init_fn 即 psci_1_0_init，如下：

static int __init psci_1_0_init(struct device_node *np)
{int err;err = psci_0_2_init(np);if (err)return err;if (psci_has_osi_support())pr_info("OSI mode supported.\n");return 0;
}

再进入 psci_0_2_init 函数，如下：

/** PSCI init function for PSCI versions >=0.2** Probe based on PSCI PSCI_VERSION function*/
static int __init psci_0_2_init(struct device_node *np)
{int err;err = get_set_conduit_method(np);if (err)return err;/** Starting with v0.2, the PSCI specification introduced a call* (PSCI_VERSION) that allows probing the firmware version, so* that PSCI function IDs and version specific initialization* can be carried out according to the specific version reported* by firmware*/return psci_probe();
}

psci_0_2_init 函数就是用来初始化 PSCI 相关函数的，并且PSCI版本需要大于等于0.2(PSCI < v0.2 调用 psci_0_1_init) 如下：

get_set_conduit_method

static int get_set_conduit_method(struct device_node *np)
{const char *method;pr_info("probing for conduit method from DT.\n");if (of_property_read_string(np, "method", &method)) {pr_warn("missing \"method\" property\n");return -ENXIO;}if (!strcmp("hvc", method)) {set_conduit(PSCI_CONDUIT_HVC);} else if (!strcmp("smc", method)) {set_conduit(PSCI_CONDUIT_SMC);} else {pr_warn("invalid \"method\" property: %s\n", method);return -EINVAL;}return 0;
}

该函数首先会解析PSCI设备树节点 np 中的 method 属性，以确定通信指令，如果需要陷入到 hypervisor 则为 hvc，否则如果需要陷入到 ATF，则为 smc，具体需要看PSCI节点的配置。

set_conduit

typedef unsigned long (psci_fn)(unsigned long, unsigned long,unsigned long, unsigned long);
static psci_fn *invoke_psci_fn;static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,unsigned long arg0, unsigned long arg1,unsigned long arg2)
{struct arm_smccc_res res;arm_smccc_hvc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);return res.a0;
}static unsigned long __invoke_psci_fn_smc(unsigned long function_id,unsigned long arg0, unsigned long arg1,unsigned long arg2)
{struct arm_smccc_res res;arm_smccc_smc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);return res.a0;
}static void set_conduit(enum psci_conduit conduit)
{switch (conduit) {case PSCI_CONDUIT_HVC:invoke_psci_fn = __invoke_psci_fn_hvc;break;case PSCI_CONDUIT_SMC:invoke_psci_fn = __invoke_psci_fn_smc;break;default:WARN(1, "Unexpected PSCI conduit %d\n", conduit);}psci_ops.conduit = conduit;
}

set_conduit 函数主要是根据 conduit 的不同而对函数指针 invoke_psci_fn 进行初始化，以方便后面调用。

psci_probe

在分析psci_probe函数之前先看一个PSCI结构体,如下：

struct psci_operations {u32 (*get_version)(void);int (*cpu_suspend)(u32 state, unsigned long entry_point);int (*cpu_off)(u32 state);int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);int (*migrate)(unsigned long cpuid);int (*affinity_info)(unsigned long target_affinity,unsigned long lowest_affinity_level);int (*migrate_info_type)(void);enum psci_conduit conduit;enum smccc_version smccc_version;
};

struct psci_operations 是 Linux 内核中用于抽象 PSCI 接口的核心数据结构，里面定义了操作系统与 Hypervisor/ATF 之间交互的电源管理函数集，如下：

get_version：返回 PSCI 版本号

cpu_suspend：将 CPU 置于指定低功耗状态（state），并在唤醒时跳转到 entry_point

cpu_off：关闭CPU

cpu_on：开启指定 CPU（cpuid），并设置其启动地址为 entry_point

migrate：将 Trusted OS 迁移到指定 CPU（cpuid），注意这里迁移的不是 Linux 的任务

affinity_info：查询指定亲和性实例（如某个 CPU）的电源状态（ON/OFF/ON_PENDING），例如后文 cpu_psci_cpu_kill 用它确认 CPU 是否已经关闭

migrate_info_type：查询 Trusted OS 的迁移类型（是否为单核 Trusted OS、是否支持迁移）

conduit：PSCI_CONDUIT_HVC 或 PSCI_CONDUIT_SMC

smccc_version：SMCCC_VERSION_1_0 或 SMCCC_VERSION_1_1

通过 get_set_conduit_method 函数设置好PSCI通信方法之后，则调用 psci_probe 函数初始化对应版本的电源管理接口，如下：

/** Probe function for PSCI firmware versions >= 0.2*/
static int __init psci_probe(void)
{u32 ver = psci_get_version();pr_info("PSCIv%d.%d detected in firmware.\n",PSCI_VERSION_MAJOR(ver),PSCI_VERSION_MINOR(ver));if (PSCI_VERSION_MAJOR(ver) == 0 && PSCI_VERSION_MINOR(ver) < 2) {pr_err("Conflicting PSCI version detected.\n");return -EINVAL;}psci_0_2_set_functions();psci_init_migrate();if (PSCI_VERSION_MAJOR(ver) >= 1) {psci_init_smccc();psci_init_cpu_suspend();psci_init_system_suspend();psci_init_system_reset2();}return 0;
}

函数首先获取 PSCI的版本号，看是否符合大于等于 0.2 的标准，如果符合则调用 psci_0_2_set_functions 函数，如下：

static void __init psci_0_2_set_functions(void)
{pr_info("Using standard PSCI v0.2 function IDs\n");psci_ops.get_version = psci_get_version;psci_function_id[PSCI_FN_CPU_SUSPEND] =PSCI_FN_NATIVE(0_2, CPU_SUSPEND);psci_ops.cpu_suspend = psci_cpu_suspend;psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;psci_ops.cpu_off = psci_cpu_off;psci_function_id[PSCI_FN_CPU_ON] = PSCI_FN_NATIVE(0_2, CPU_ON);psci_ops.cpu_on = psci_cpu_on;psci_function_id[PSCI_FN_MIGRATE] = PSCI_FN_NATIVE(0_2, MIGRATE);psci_ops.migrate = psci_migrate;psci_ops.affinity_info = psci_affinity_info;psci_ops.migrate_info_type = psci_migrate_info_type;arm_pm_restart = psci_sys_reset;pm_power_off = psci_sys_poweroff;
}

可以看到 psci_0_2_set_functions 函数其实就是对 psci_operations 进行初始化，设置相应的回调函数，比如当操作系统需要hotplug时，就会调用到 psci_cpu_off/psci_cpu_on 函数，到此为止 PSCI 的初始化就完成，下面将介绍对应 CPU 的相关初始化。

二、CPU PSCI 操作初始化流程

1、CPU 设备树节点

cpus {#size-cells = <0x00>;#address-cells = <0x01>;......cpu@0 {phandle = <0x8004>;reg = <0x00>;enable-method = "psci";compatible = "arm,cortex-a53";device_type = "cpu";};cpu@1 {phandle = <0x8003>;reg = <0x01>;enable-method = "psci";compatible = "arm,cortex-a53";device_type = "cpu";};......
};

这里只关注“enable-method”字段，该字段描述了该 CPU 的启动方式，这里的启动方式是 “psci”，还有一种“spin-table”启动方式这里将不再说明。

2、 struct cpu_operations

/*** struct cpu_operations - Callback operations for hotplugging CPUs.** @name:	Name of the property as appears in a devicetree cpu node's*		enable-method property. On systems booting with ACPI, @name*		identifies the struct cpu_operations entry corresponding to*		the boot protocol specified in the ACPI MADT table.* @cpu_init:	Reads any data necessary for a specific enable-method for a*		proposed logical id.* @cpu_prepare: Early one-time preparation step for a cpu. If there is a*		mechanism for doing so, tests whether it is possible to boot*		the given CPU.* @cpu_boot:	Boots a cpu into the kernel.* @cpu_postboot: Optionally, perform any post-boot cleanup or necesary*		synchronisation. Called from the cpu being booted.* @cpu_can_disable: Determines whether a CPU can be disabled based on*		mechanism-specific information.* @cpu_disable: Prepares a cpu to die. May fail for some mechanism-specific* 		reason, which will cause the hot unplug to be aborted. Called* 		from the cpu to be killed.* @cpu_die:	Makes a cpu leave the kernel. Must not fail. Called from the*		cpu being killed.* @cpu_kill:  Ensures a cpu has left the kernel. Called from another cpu.* @cpu_init_idle: Reads any data necessary to initialize CPU idle states for*		   a proposed logical id.* @cpu_suspend: Suspends a cpu and saves the required context. May fail owing*               to wrong parameters or error conditions. Called from the*               CPU being suspended. Must be called with IRQs disabled.*/
struct cpu_operations {const char	*name;int		(*cpu_init)(unsigned int);int		(*cpu_prepare)(unsigned int);int		(*cpu_boot)(unsigned int);void		(*cpu_postboot)(void);
#ifdef CONFIG_HOTPLUG_CPUbool		(*cpu_can_disable)(unsigned int cpu);int		(*cpu_disable)(unsigned int cpu);void		(*cpu_die)(unsigned int cpu);int		(*cpu_kill)(unsigned int cpu);
#endif
#ifdef CONFIG_CPU_IDLEint		(*cpu_init_idle)(unsigned int);int		(*cpu_suspend)(unsigned long);
#endif
};

struct cpu_operations 是 arm64 内核中描述 CPU 启动、hotplug 与 idle/suspend 操作的核心数据结构，为不同的 enable-method（如 psci、spin-table）提供统一的接口，这种“操作函数集”的抽象也是 linux 常用的一种方法。各成员如下：

name：该操作集的名称，即 cpu 节点的 enable-method 属性

cpu_init：为指定的逻辑 CPU 读取该 enable-method 所需的数据（例如 spin-table 方式下的 cpu-release-addr）

cpu_prepare：准备启动 CPU，验证其可启动性

cpu_boot：启动 CPU（PSCI 方式下最终会调用 PSCI 的 CPU_ON 接口）

cpu_postboot：在目标 CPU 上执行启动后的清理或同步操作

cpu_can_disable：检查 CPU 是否可以被禁用

cpu_disable：为关闭 CPU 做准备（在即将被关闭的 CPU 上调用）；可能因机制相关的原因失败，失败时本次 hot unplug 会被中止

cpu_die：使 CPU 退出内核（如进入 WFI ）

cpu_kill：确认 CPU 已完全关闭（从其他 CPU 调用）

cpu_init_idle：初始化 CPU idle

cpu_suspend：suspend CPU 并保存上下文

3、kernel 流程

介绍完cpu_operations数据结构后再回到 setup_arch 函数中:

setup_arch

cpu_read_bootcpu_ops （CPU0 cpu_operations 的初始化）

smp_init_cpus （CPUx cpu_operations 的初始化）

cpu_read_bootcpu_ops

// arch/arm64/kernel/cpu_ops.c/** Read a cpu's enable method and record it in cpu_ops.*/
int __init cpu_read_ops(int cpu)
{const char *enable_method = cpu_read_enable_method(cpu);if (!enable_method)return -ENODEV;cpu_ops[cpu] = cpu_get_ops(enable_method);if (!cpu_ops[cpu]) {pr_warn("Unsupported enable-method: %s\n", enable_method);return -EOPNOTSUPP;}return 0;
}static inline void __init cpu_read_bootcpu_ops(void)
{cpu_read_ops(0);
}

通过注释就能够看出 cpu_read_ops 函数是用来读取指定 CPU 的 enable 方法并将其记录在 cpu_ops 中，cpu_read_bootcpu_ops 传入的参数是 0，即读取 CPU0；cpu_read_enable_method 函数就是从该 CPU 的设备树节点中读取“enable-method”属性值，并赋给 enable_method 变量（通过上面 CPU0 的设备树节点知道这个值为 "psci"），因为这个函数功能比较简单，这里将不再展开分析。

再继续往下走，首先注意到一个全局的指针数组，如下：

const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;

这里 cpu_ops 是一个 指针数组，包含 NR_CPUS 个元素，每个元素是 const struct cpu_operations* 类型的指针，主要是用于为每个 CPU 提供独立的操作函数集。所以这里首先是对 CPU0 设置 cpu_operations 函数操作集。

cpu_get_ops 函数如下：

// arch/arm64/kernel/cpu_ops.cextern const struct cpu_operations smp_spin_table_ops;
extern const struct cpu_operations cpu_psci_ops;const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = {&smp_spin_table_ops,&cpu_psci_ops,NULL,
};static const struct cpu_operations * __init cpu_get_ops(const char *name)
{const struct cpu_operations *const *ops;ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops;while (*ops) {if (!strcmp(name, (*ops)->name))return *ops;ops++;}return NULL;
}

cpu_get_ops 函数会根据 acpi_disabled 的值选择候选操作集表：acpi_disabled 默认是1（include/linux/acpi.h），所以这里使用 dt_supported_cpu_ops，然后遍历该表，用 strcmp 将每个表项的 name 与传入的 enable-method 进行比较。前面 cpu_read_bootcpu_ops 函数获取了 CPU0 的 enable-method 为 "psci"，并传到了 cpu_get_ops 函数，所以最终确定了 CPU0 的 cpu_operations 为 cpu_psci_ops，如下：

// arch/arm64/kernel/psci.c
static int __init cpu_psci_cpu_init(unsigned int cpu)
{return 0;
}static int __init cpu_psci_cpu_prepare(unsigned int cpu)
{if (!psci_ops.cpu_on) {pr_err("no cpu_on method, not booting CPU%d\n", cpu);return -ENODEV;}return 0;
}static int cpu_psci_cpu_boot(unsigned int cpu)
{int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa_symbol(secondary_entry));if (err)pr_err("failed to boot CPU%d (%d)\n", cpu, err);return err;
}#ifdef CONFIG_HOTPLUG_CPU
static bool cpu_psci_cpu_can_disable(unsigned int cpu)
{return !psci_tos_resident_on(cpu);
}static int cpu_psci_cpu_disable(unsigned int cpu)
{/* Fail early if we don't have CPU_OFF support */if (!psci_ops.cpu_off)return -EOPNOTSUPP;/* Trusted OS will deny CPU_OFF */if (psci_tos_resident_on(cpu))return -EPERM;return 0;
}static void cpu_psci_cpu_die(unsigned int cpu)
{/** There are no known implementations of PSCI actually using the* power state field, pass a sensible default for now.*/u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN <<PSCI_0_2_POWER_STATE_TYPE_SHIFT;psci_ops.cpu_off(state);
}static int cpu_psci_cpu_kill(unsigned int cpu)
{int err;unsigned long start, end;if (!psci_ops.affinity_info)return 0;/** cpu_kill could race with cpu_die and we can* potentially end up declaring this cpu undead* while it is dying. So, try again a few times.*/start = jiffies;end = start + msecs_to_jiffies(100);do {err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {pr_info("CPU%d killed (polled %d ms)\n", cpu,jiffies_to_msecs(jiffies - start));return 0;}usleep_range(100, 1000);} while (time_before(jiffies, end));pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",cpu, err);return -ETIMEDOUT;
}
#endifconst struct cpu_operations cpu_psci_ops = {.name		= "psci",.cpu_init	= cpu_psci_cpu_init,.cpu_prepare	= cpu_psci_cpu_prepare,.cpu_boot	= cpu_psci_cpu_boot,
#ifdef CONFIG_HOTPLUG_CPU.cpu_can_disable = cpu_psci_cpu_can_disable,.cpu_disable	= cpu_psci_cpu_disable,.cpu_die	= cpu_psci_cpu_die,.cpu_kill	= cpu_psci_cpu_kill,
#endif
};

cpu_psci_ops 就是 enable-method 为 "psci" 的 CPU 所使用的 cpu_operations 结构体，可以看到里面的函数指针其实又会调用到前面初始化的 PSCI 操作集函数，这也就是linux系统中常用的分层操作，对上提供统一接口，而对下则对应不同的硬件平台。需要注意 CPU0 作为 boot CPU 在内核启动时已经在运行，并不需要通过 cpu_boot 启动；而在启动 secondary CPU 时，就会调用 cpu_psci_cpu_boot 函数，进而调用 PSCI 操作集中的 psci_cpu_on 函数。

smp_init_cpus

因为现在大部分都是SMP（symmetric multi-processing）操作系统，所以不可能只有一个 CPU，在介绍完CPU0 的cpu_operations 初始化之后，再介绍一下 CPUx（secondary CPU）cpu_operations 的初始化。如下：

// arch/arm64/kernel/smp.c
/** Enumerate the possible CPU set from the device tree or ACPI and build the* cpu logical map array containing MPIDR values related to logical* cpus. Assumes that cpu_logical_map(0) has already been initialized.*/
void __init smp_init_cpus(void)
{int i;if (acpi_disabled)of_parse_and_init_cpus();elseacpi_parse_and_init_cpus();if (cpu_count > nr_cpu_ids)pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n",cpu_count, nr_cpu_ids);if (!bootcpu_valid) {pr_err("missing boot CPU MPIDR, not enabling secondaries\n");return;}/** We need to set the cpu_logical_map entries before enabling* the cpus so that cpu processor description entries (DT cpu nodes* and ACPI MADT entries) can be retrieved by matching the cpu hwid* with entries in cpu_logical_map while initializing the cpus.* If the cpu set-up fails, invalidate the cpu_logical_map entry.*/for (i = 1; i < nr_cpu_ids; i++) {if (cpu_logical_map(i) != INVALID_HWID) {if (smp_cpu_setup(i))set_cpu_logical_map(i, INVALID_HWID);}}
}

smp_init_cpus 函数是 Linux 内核中用于初始化多核处理器的核心函数，主要职责是根据硬件描述（设备树或 ACPI）遍历 CPU 标识所有可用的 CPU ，并建立映射。

这里只对 secondary CPU 的 cpu_operations 结构体进行分析也就是 smp_cpu_setup 函数，其它细节这里将不再赘述，如下：

// arch/arm64/kernel/smp.c
/** Initialize cpu operations for a logical cpu and* set it in the possible mask on success*/
static int __init smp_cpu_setup(int cpu)
{if (cpu_read_ops(cpu))return -ENODEV;if (cpu_ops[cpu]->cpu_init(cpu))return -ENODEV;set_cpu_possible(cpu, true);return 0;
}

和 CPU0 一样，这里也是调用 cpu_read_ops 函数进行初始化 cpu_operations 的，只是传入的 CPU id 不一样，所以将不再赘述。

在初始化完 cpu_operations 之后就调用了 cpu_ops[cpu]->cpu_init(cpu) 也就是 cpu_psci_cpu_init 函数，但在linux-5.4.239版本中这个函数好像没有做什么操作，如下：

// arch/arm64/kernel/psci.c
static int __init cpu_psci_cpu_init(unsigned int cpu)
{return 0;
}

smp_cpu_setup 函数最后调用 set_cpu_possible 如下：

// include/linux/cpumask.hstatic inline void
set_cpu_possible(unsigned int cpu, bool possible)
{if (possible)cpumask_set_cpu(cpu, &__cpu_possible_mask);elsecpumask_clear_cpu(cpu, &__cpu_possible_mask);
}

该函数会设置指定 CPU 在 __cpu_possible_mask 中的掩码位，以告知内核此 CPU 可能存在；反之则清除掩码位，让内核忽略此 CPU，不再为其分配任务。

还有一些类似的函数如下：

// 标记 CPU 是否物理存在
static inline void
set_cpu_present(unsigned int cpu, bool present)
{if (present)cpumask_set_cpu(cpu, &__cpu_present_mask);elsecpumask_clear_cpu(cpu, &__cpu_present_mask);
}// 标记 CPU 已在线（已启动并加入调度），在 linux-5.4 中由 kernel/cpu.c 实现
void set_cpu_online(unsigned int cpu, bool online);// 标记 CPU 是否参与负载均衡（允许任务迁移）
static inline void
set_cpu_active(unsigned int cpu, bool active)
{if (active)cpumask_set_cpu(cpu, &__cpu_active_mask);elsecpumask_clear_cpu(cpu, &__cpu_active_mask);
}

这些函数都是用来标记 CPU 的不同状态的，以确保 Linux 内核能够高效的为电源管理、热插拔等功能提供服务。

三、CPU PSCI多核启动流程

分析到这里 PSCI 的初始化流程和CPU 的 cpu_operations 结构体初始化就完成了，那初始化的这些回调函数在什么时候被调用呢？下面将分析使用 PSCI 的 CPU 启动流程（只包含kernel部分）。

1、boot cpu 启动流程

在 secondary CPU 没有被启动之前所有的操作默认都是由 boot CPU 执行的，所以这里对 boot CPU 的启动只做简单分析。

start_kernel

        ........

        boot_cpu_init

        ........

        arch_call_rest_init

                rest_init

boot_cpu_init:

// kernel/cpu.c/** Activate the first processor.*/
void __init boot_cpu_init(void)
{int cpu = smp_processor_id();/* Mark the boot cpu "present", "online" etc for SMP and UP case */set_cpu_online(cpu, true);set_cpu_active(cpu, true);set_cpu_present(cpu, true);set_cpu_possible(cpu, true);#ifdef CONFIG_SMP__boot_cpu_id = cpu;
#endif
}

boot_cpu_init 是 Linux 内核启动引导 CPU（第一个启动的 CPU）的核心函数，确保其状态掩码被正确标记以支持后续的多核调度。

函数首先会获取执行当前代码的 CPU 逻辑 ID，因为其它 CPU 还没有启动，所以这里得到的就是 boot CPU 的逻辑 id，在 arm64 上为 0。

设置 CPU 状态掩码：

set_cpu_online(cpu, true);
set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);

set_cpu_present：标记 CPU 物理真实存在

set_cpu_possible：声明逻辑支持（受 CONFIG_NR_CPUS 限制）

set_cpu_online：启用调度（允许任务分配）

set_cpu_active：参与负载均衡（允许任务迁移）

注意 boot_cpu_init 中实际的设置顺序是 online → active → present → possible，这里只是分别置位四个相互独立的掩码，并不依赖 present → possible → online → active 这样的先后顺序。

最后，如果开启了 CONFIG_SMP 配置，即 SMP 系统，则将 boot CPU id 保存到 __boot_cpu_id 以方便系统需要获取 boot CPU id。

rest_init 函数如下：

// init/main.c
noinline void __ref rest_init(void)
{struct task_struct *tsk;int pid;rcu_scheduler_starting();/** We need to spawn init first so that it obtains pid 1, however* the init task will end up wanting to create kthreads, which, if* we schedule it before we create kthreadd, will OOPS.*/pid = kernel_thread(kernel_init, NULL, CLONE_FS);/** Pin init on the boot CPU. Task migration is not properly working* until sched_init_smp() has been run. It will set the allowed* CPUs for init to the non isolated CPUs.*/rcu_read_lock();tsk = find_task_by_pid_ns(pid, &init_pid_ns);set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));rcu_read_unlock();numa_default_policy();pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);rcu_read_lock();kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);rcu_read_unlock();/** Enable might_sleep() and smp_processor_id() checks.* They cannot be enabled earlier because with CONFIG_PREEMPTION=y* kernel_thread() would trigger might_sleep() splats. With* CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled* already, but it's stuck on the kthreadd_done completion.*/system_state = SYSTEM_SCHEDULING;complete(&kthreadd_done);/** The boot idle thread must execute schedule()* at least once to get things moving:*/schedule_preempt_disabled();/* Call into cpu_idle with preempt disabled */cpu_startup_entry(CPUHP_ONLINE);
}

rest_init 是 Linux 内核启动流程中的核心函数，负责初始化关键系统进程和多核调度环境，下面将对该函数进行分析，如下：

创建 Init 进程：

pid = kernel_thread(kernel_init, NULL, CLONE_FS);

kernel_init 是内核线程入口函数，最终会执行用户空间的 init 程序（如 /sbin/init）；由于它是第一个被创建的线程，因此获得 PID 1。

绑定 init 进程到启动 CPU：

tsk = find_task_by_pid_ns(pid, &init_pid_ns);
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));

在 SMP 调度初始化（sched_init_smp）未完成时，如果出现任务迁移可能导致系统崩溃，init 进程固定在启动 CPU（CPU 0）运行，直到调度器准备就绪。

创建 kthreadd 守护进程：

pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);

创建内核线程管理器（PID 2），即所有内核线程的父进程，这里需要注意的是 kthreadd 必须准备就绪后，其他内核线程才能安全创建，即这里的 complete(&kthreadd_done)，如下：

同步与调度激活：

system_state = SYSTEM_SCHEDULING;
complete(&kthreadd_done);

system_state 用来标记系统进入可调度状态，complete(&kthreadd_done)则是通知 kernel_init 进程 kthreadd 已经就绪，可以继续往下运行。

然后 boot CPU 的 idle 线程主动执行一次调度（schedule_preempt_disabled）让其他任务开始运行，随后调用 cpu_startup_entry(CPUHP_ONLINE) 进入 idle 循环，等待被中断或任务唤醒。

schedule_preempt_disabled();
cpu_startup_entry(CPUHP_ONLINE);

这里有一个同步的问题：为了让 init 获得 PID 1，需要先创建 init，再创建 kthreadd（PID 2）；但 init 后续会创建内核线程，若它在 kthreadd 准备就绪之前就被调度运行，会导致 OOPS，因此通过 kthreadd_done 确保 init 在 kthreadd 准备就绪后再继续往下执行。

2、secondary CPU 启动流程

前面分析了这么长的流程好像都还没有初始化 secondary CPUs，那 secondary CPUs 是在哪里被开启的呢？直接进入到 kernel_init ，如下：

kernel_init

        kernel_init_freeable

            // Wait until kthreadd is all set-up.

            wait_for_completion(&kthreadd_done);

            smp_init();

在 kernel_init_freeable 函数中会先等待 kthreadd 准备就绪之后再往下执行，进入到smp_init 函数，如下：

// kernel/smp.c/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{int num_nodes, num_cpus;unsigned int cpu;idle_threads_init();cpuhp_threads_init();pr_info("Bringing up secondary CPUs ...\n");/* FIXME: This should be done in userspace --RR */for_each_present_cpu(cpu) {if (num_online_cpus() >= setup_max_cpus)break;if (!cpu_online(cpu))cpu_up(cpu);}num_nodes = num_online_nodes();num_cpus  = num_online_cpus();pr_info("Brought up %d node%s, %d CPU%s\n",num_nodes, (num_nodes > 1 ? "s" : ""),num_cpus,  (num_cpus  > 1 ? "s" : ""));/* Any cleanup work */smp_cpus_done(setup_max_cpus);
}

smp_init 函数由 boot CPU 调用，主要功能就是启动所有可用的 secondary CPU。

首先会初始化两个线程：

idle_threads_init：为每个 CPU 创建 idle 线程，用于无任务时的低功耗等待。

cpuhp_threads_init：初始化 CPU 热插拔线程，以动态的打开/关闭 CPU。

然后就开始启动 secondary CPUs：

	pr_info("Bringing up secondary CPUs ...\n");/* FIXME: This should be done in userspace --RR */for_each_present_cpu(cpu) {if (num_online_cpus() >= setup_max_cpus)break;if (!cpu_online(cpu))cpu_up(cpu);}

遍历 CPU： for_each_present_cpu 会遍历所有由 __cpu_present_mask 标记的 CPU ，即所有物理存在的 CPU

限制数量：setup_max_cpus 默认为 NR_CPUS，可通过内核启动参数 maxcpus= 修改，用于限制启动阶段上线的 CPU 数量，在线 CPU 数达到该值后就不再继续启动

启动 CPU：如果当前CPU不在线，则调用 cpu_up(cpu) 开始 secondary CPUs 的启动流程

cpu_up 流程如下：

cpu_up                // kernel/cpu.c

        do_cpu_up(cpu, CPUHP_ONLINE);

                _cpu_up(cpu, 0, target)

                        cpuhp_up_callbacks

                                ......

这里关于 cpu_up 的后续流程可以参考：Linux电源管理——CPU Hotplug 流程

再对 nodes 和 CPU 进行统计：

num_nodes = num_online_nodes();  // 统计激活的 NUMA 节点数
num_cpus  = num_online_cpus();   // 统计已启动的 CPU 核心数

最后清理资源并告知 kernel secondary CPUs 初始化完成：

smp_cpus_done(setup_max_cpus);

到目前为止所有 boot CPU 和 secondary CPUs 都已经启动完毕，并都已经进入 idle 线程，等待执行任务。