【QEMU系统分析之启动篇（十一）】

系列文章目录

第十一章 QEMU系统仿真的加速器初始化分析

文章目录

系列文章目录
- 第十一章 QEMU系统仿真的加速器初始化分析
前言
一、QEMU是什么？
二、QEMU系统仿真的启动分析
- 1.系统仿真的初始化代码
- 2.主循环数据初始化
  - configure_accelerators()
  - phase_advance(PHASE_ACCEL_CREATED)
总结

前言

本文以 QEMU 8.2.2 为例，分析其作为系统仿真工具的启动过程，并为读者展示各种 QEMU 系统仿真的启动配置实例。
本文读者需要具备一定的 QEMU 系统仿真使用经验，并对 C 语言编程有一定了解。

一、QEMU是什么？

QEMU 是一个通用且开源的机器模拟器和虚拟机。
其官方主页是：https://www.qemu.org/

二、QEMU系统仿真的启动分析

1.系统仿真的初始化代码

QEMU 作为系统仿真工具，其入口代码在 system/main.c 文件中，初始化函数 qemu_init() 的实现在 system/vl.c 文件中，在完成 QEMU 虚拟机配置项处理和应用后，系统将配置虚拟机的加速器。本篇文章将完成以下代码部分的分析。

2.主循环数据初始化

这部分代码在 system/vl.c 文件中，实现如下：

/*
 * Excerpt of qemu_init(): only the accelerator-related steps are shown,
 * the elided parts of the function are marked with "...".
 */
void qemu_init(int argc, char **argv)
{
...
    /*
     * Note: uses machine properties such as kernel-irqchip, must run
     * after qemu_apply_machine_options.
     */
    configure_accelerators(argv[0]);
    /* The accelerator has been configured; move on to the next init phase. */
    phase_advance(PHASE_ACCEL_CREATED);
...
}

这段代码首先调用函数 configure_accelerators() 完成加速器的配置，然后调用 phase_advance(PHASE_ACCEL_CREATED) 将机器初始化阶段推进到 PHASE_ACCEL_CREATED。

configure_accelerators()

此函数在 system/vl.c 文件中，定义如下：

static void configure_accelerators(const char *progname)
{bool init_failed = false;qemu_opts_foreach(qemu_find_opts("icount"),do_configure_icount, NULL, &error_fatal);if (QTAILQ_EMPTY(&qemu_accel_opts.head)) {char **accel_list, **tmp;if (accelerators == NULL) {/* Select the default accelerator */bool have_tcg = accel_find("tcg");bool have_kvm = accel_find("kvm");if (have_tcg && have_kvm) {if (g_str_has_suffix(progname, "kvm")) {/* If the program name ends with "kvm", we prefer KVM */accelerators = "kvm:tcg";} else {accelerators = "tcg:kvm";}} else if (have_kvm) {accelerators = "kvm";} else if (have_tcg) {accelerators = "tcg";} else {error_report("No accelerator selected and"" no default accelerator available");exit(1);}}accel_list = g_strsplit(accelerators, ":", 0);for (tmp = accel_list; *tmp; tmp++) {/** Filter invalid accelerators here, to prevent obscenities* such as "-machine accel=tcg,,thread=single".*/if (accel_find(*tmp)) {qemu_opts_parse_noisily(qemu_find_opts("accel"), *tmp, true);} else {init_failed = true;error_report("invalid accelerator %s", *tmp);}}g_strfreev(accel_list);} else {if (accelerators != NULL) {error_report("The -accel and \"-machine accel=\" options are incompatible");exit(1);}}if (!qemu_opts_foreach(qemu_find_opts("accel"),do_configure_accelerator, &init_failed, &error_fatal)) {if (!init_failed) {error_report("no accelerator found");}exit(1);}if (init_failed && !qtest_chrdev) {error_report("falling back to %s", current_accel_name());}if (icount_enabled() && !tcg_enabled()) {error_report("-icount is not allowed with hardware virtualization");exit(1);}
}

configure_accelerators() 首先遍历 icount 选项，并对每一项调用函数 do_configure_icount()，该函数定义如下：

/*
 * Per-option callback handed to qemu_opts_foreach() for the "icount"
 * option group: forwards the options to icount_configure().
 *
 * @opaque: unused.
 * @opts:   the option set to apply.
 * @errp:   error destination passed through to icount_configure().
 *
 * Always returns 0; failures are only reported through @errp.
 */
static int do_configure_icount(void *opaque, QemuOpts *opts, Error **errp)
{
    (void)opaque;

    icount_configure(opts, errp);

    return 0;
}

而函数 icount_configure() 在 TCG 加速器中定义如下：

/*
 * Configure instruction counting from the "shift", "sleep" and "align"
 * options found in @opts.
 *
 * Without a "shift" option nothing is configured (and giving "align"
 * alone is reported as an error).  A numeric shift enables the precise
 * mode; "shift=auto" enables the adaptive mode and arms the realtime and
 * virtual-time adjustment timers.
 *
 * Errors are reported through @errp; on error the function returns before
 * any global icount state has been modified.
 */
void icount_configure(QemuOpts *opts, Error **errp)
{
    const char *option = qemu_opt_get(opts, "shift");
    bool sleep = qemu_opt_get_bool(opts, "sleep", true);
    bool align = qemu_opt_get_bool(opts, "align", false);
    long time_shift = -1;

    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    if (align && !sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
        return;
    }

    if (strcmp(option, "auto") != 0) {
        if (qemu_strtol(option, NULL, 0, &time_shift) < 0
            || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
            error_setg(errp, "icount: Invalid shift value");
            return;
        }
    } else if (align) {
        /*
         * Validate the values parsed from @opts: the icount_align_option
         * and icount_sleep globals are only assigned below, so testing
         * them here would check stale state.
         */
        error_setg(errp, "shift=auto and align=on are incompatible");
        return;
    } else if (!sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
        return;
    }

    /* All options validated: commit them to the global state. */
    icount_sleep = sleep;
    if (icount_sleep) {
        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                                      icount_timer_cb, NULL);
    }

    icount_align_option = align;

    if (time_shift >= 0) {
        /* Explicit shift value: fixed ("precise") instruction counting. */
        timers_state.icount_time_shift = time_shift;
        icount_enable_precise();
        return;
    }

    icount_enable_adaptive();

    /*
     * 125MIPS seems a reasonable initial guess at the guest speed.
     * It will be corrected fairly quickly anyway.
     */
    timers_state.icount_time_shift = 3;

    /*
     * Have both realtime and virtual time triggers for speed adjustment.
     * The realtime trigger catches emulated time passing too slowly,
     * the virtual time trigger catches emulated time passing too fast.
     * Realtime triggers occur even when idle, so use them less frequently
     * than VM triggers.
     */
    timers_state.vm_clock_warp_start = -1;
    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                                icount_adjust_rt, NULL);
    timer_mod(timers_state.icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                                icount_adjust_vm, NULL);
    timer_mod(timers_state.icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
}

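如果用户没有指定加速器（accelerators 为 NULL），则通过 accel_find() 查找当前是否支持 TCG 和 KVM 加速器，并据此选择默认加速器：两者都支持时，若程序名以 "kvm" 结尾则优先使用 KVM（"kvm:tcg"），否则优先使用 TCG（"tcg:kvm"）；只支持其中一种时就使用该加速器；两者都不支持则报错并退出。实现代码如下：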

        /*
         * Excerpt from configure_accelerators(): choose a default
         * accelerator list when none was requested.
         */
        if (accelerators == NULL) {
            /* Select the default accelerator */
            bool have_tcg = accel_find("tcg");
            bool have_kvm = accel_find("kvm");

            if (have_tcg && have_kvm) {
                if (g_str_has_suffix(progname, "kvm")) {
                    /* If the program name ends with "kvm", we prefer KVM */
                    accelerators = "kvm:tcg";
                } else {
                    accelerators = "tcg:kvm";
                }
            } else if (have_kvm) {
                accelerators = "kvm";
            } else if (have_tcg) {
                accelerators = "tcg";
            } else {
                /* Neither accelerator is available: nothing to fall back to. */
                error_report("No accelerator selected and"
                             " no default accelerator available");
                exit(1);
            }
        }

函数 do_configure_accelerator() 定义如下：

/*
 * Per-option callback handed to qemu_opts_foreach() for the "accel"
 * option group: instantiate the requested accelerator, apply its
 * properties and initialise it for current_machine.
 *
 * @opaque: points to a bool that is set to true when this accelerator
 *          could not be set up.
 * @opts:   the accelerator option set ("accel" names the accelerator).
 * @errp:   error destination for a missing "accel" parameter.
 *
 * Returns 1 when the accelerator was initialised, 0 otherwise.
 */
static int do_configure_accelerator(void *opaque, QemuOpts *opts, Error **errp)
{
    bool *p_init_failed = opaque;
    const char *acc = qemu_opt_get(opts, "accel");
    AccelClass *ac;
    AccelState *accel;
    int ret;
    bool qtest_with_kvm;

    if (!acc) {
        error_setg(errp, QERR_MISSING_PARAMETER, "accel");
        goto bad;
    }

    /*
     * Look the class up only once the name is known to be non-NULL:
     * accel_find() formats it with "%s".
     */
    ac = accel_find(acc);

    qtest_with_kvm = g_str_equal(acc, "kvm") && qtest_chrdev != NULL;

    if (!ac) {
        if (!qtest_with_kvm) {
            error_report("invalid accelerator %s", acc);
        }
        goto bad;
    }
    accel = ACCEL(object_new_with_class(OBJECT_CLASS(ac)));
    object_apply_compat_props(OBJECT(accel));
    qemu_opt_foreach(opts, accelerator_set_property,
                     accel,
                     &error_fatal);
    /*
     * If legacy -singlestep option is set, honour it for TCG and
     * silently ignore for any other accelerator (which is how this
     * option has always behaved).
     */
    if (opt_one_insn_per_tb) {
        /*
         * This will always succeed for TCG, and we want to ignore
         * the error from trying to set a nonexistent property
         * on any other accelerator.
         */
        object_property_set_bool(OBJECT(accel), "one-insn-per-tb", true, NULL);
    }
    ret = accel_init_machine(accel, current_machine);
    if (ret < 0) {
        /* Under qtest a KVM failure with -ENOENT is not reported. */
        if (!qtest_with_kvm || ret != -ENOENT) {
            error_report("failed to initialize %s: %s", acc, strerror(-ret));
        }
        goto bad;
    }

    return 1;

bad:
    *p_init_failed = true;
    return 0;
}

函数 accel_find() 定义如下：

/*
 * Look up the AccelClass for the accelerator named @opt_name.
 *
 * The QOM class name is built with ACCEL_CLASS_NAME() and resolved via
 * module_object_class_by_name().  Returns NULL if not found.
 */
AccelClass *accel_find(const char *opt_name)
{
    AccelClass *found;
    char *type_name;

    type_name = g_strdup_printf(ACCEL_CLASS_NAME("%s"), opt_name);
    found = ACCEL_CLASS(module_object_class_by_name(type_name));
    /* The formatted name is only needed for the lookup itself. */
    g_free(type_name);

    return found;
}

数据结构 AccelState 和 AccelClass 定义如下：

struct AccelState {/*< private >*/Object parent_obj;
};typedef struct AccelClass {/*< private >*/ObjectClass parent_class;/*< public >*/const char *name;int (*init_machine)(MachineState *ms);
#ifndef CONFIG_USER_ONLYvoid (*setup_post)(MachineState *ms, AccelState *accel);bool (*has_memory)(MachineState *ms, AddressSpace *as,hwaddr start_addr, hwaddr size);
#endifbool (*cpu_common_realize)(CPUState *cpu, Error **errp);void (*cpu_common_unrealize)(CPUState *cpu);/* gdbstub related hooks */int (*gdbstub_supported_sstep_flags)(void);bool *allowed;/** Array of global properties that would be applied when specific* accelerator is chosen. It works like MachineClass.compat_props* but it's for accelerators not machines. Accelerator-provided* global properties may be overridden by machine-type* compat_props or user-provided global properties.*/GPtrArray *compat_props;
} AccelClass;

函数 accel_init_machine() 定义如下：

/*
 * Attach @accel to machine @ms and run the accelerator's init_machine hook.
 *
 * On success the accelerator's compat properties are registered.  On
 * failure the association is undone, the class's "allowed" flag is
 * cleared and the reference to @accel is dropped.
 *
 * Returns the value returned by init_machine (negative on failure).
 */
int accel_init_machine(AccelState *accel, MachineState *ms)
{
    AccelClass *klass = ACCEL_GET_CLASS(accel);
    int rc;

    /* Publish the accelerator before the hook runs. */
    ms->accelerator = accel;
    *(klass->allowed) = true;

    rc = klass->init_machine(ms);
    if (rc >= 0) {
        object_set_accelerator_compat_props(klass->compat_props);
        return rc;
    }

    /* Initialisation failed: roll back everything set up above. */
    ms->accelerator = NULL;
    *(klass->allowed) = false;
    object_unref(OBJECT(accel));

    return rc;
}

通过 accel_init_machine() 将目标机器和加速器关联在一起。

加速器的 init_machine() 函数根据每个加速器不同，有不同的实现，WHPX 的定义如下：

/*
 * Partition support
 */

/*
 * init_machine implementation of the WHPX accelerator.
 *
 * Loads the WHP dispatch table, checks that the hypervisor is present,
 * creates a partition and configures it (processor count, optional local
 * APIC emulation, extended VM exits, CPUID exit list, exception exit
 * bitmap), then sets the partition up and initialises WHPX memory handling.
 *
 * Returns 0 on success or a negative errno value on failure; on failure a
 * partition that was already created is deleted again.
 */
static int whpx_accel_init(MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    /* Resolve the WHP entry points before any whp_dispatch call below. */
    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows version which don't support this property will return with the
     * specific error code.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    /* Tell the partition how many virtual processors the machine has. */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support apic emulation and user is requiring
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
                     "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            /* Only fatal when the user explicitly required the irqchip. */
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExtendedVmExits,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    /* Tear down the partition if it was created before the failure. */
    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}

函数 init_whp_dispatch() 在 target/i386/whpx/whpx-all.c 文件中，定义如下：

/*
 * Populate whp_dispatch with the WinHvPlatform and WinHvEmulation entry
 * points.  The work is done once; later calls return true immediately.
 *
 * Returns true on success.  On failure the libraries loaded so far are
 * released, their handles are reset to NULL and false is returned.
 */
bool init_whp_dispatch(void)
{
    bool supplemental_loaded;

    if (whp_dispatch_initialized) {
        return true;
    }

    if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
        goto error;
    }

    if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
        goto error;
    }

    /*
     * Keep the call outside of assert() so the supplemental functions are
     * still loaded if assertions are ever compiled out.
     */
    supplemental_loaded = load_whp_dispatch_fns(&hWinHvPlatform,
                                                WINHV_PLATFORM_FNS_SUPPLEMENTAL);
    assert(supplemental_loaded);
    (void)supplemental_loaded;

    whp_dispatch_initialized = true;

    return true;

error:
    /* Clear the handles after freeing so a later call starts from scratch. */
    if (hWinHvPlatform) {
        FreeLibrary(hWinHvPlatform);
        hWinHvPlatform = NULL;
    }

    if (hWinHvEmulation) {
        FreeLibrary(hWinHvEmulation);
        hWinHvEmulation = NULL;
    }

    return false;
}

函数 load_whp_dispatch_fns() 定义如下：

/*
 * Load the functions from the given library, using the given handle. If a
 * handle is provided, it is used, otherwise the library is opened. The
 * handle will be updated on return with the opened one.
 *
 * @handle:        in/out module handle; NULL on entry means "load the DLL".
 * @function_list: which group of functions to resolve into whp_dispatch.
 *
 * Returns true on success.  On failure the library is released, *handle
 * is reset to NULL and false is returned.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
                                  WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"

    /* Resolve one function; a missing symbol just leaves the slot NULL. */
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name);

    /* Resolve one function; a missing symbol aborts the whole load. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        }

    /* Open the library unless the caller already supplied a handle. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
        if (!handle_lib) { \
            handle_lib = LoadLibrary(lib_name); \
            if (!handle_lib) { \
                error_report("Could not load library %s.", lib_name); \
                goto error; \
            } \
        }

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
        /*
         * hLib may be the caller's own handle; make sure the caller is not
         * left holding a reference to the module that was just released.
         */
        *handle = NULL;
    }

    return false;
}

WHPX 提供的函数类型列表如下：

typedef enum WHPFunctionList {WINHV_PLATFORM_FNS_DEFAULT,WINHV_EMULATION_FNS_DEFAULT,WINHV_PLATFORM_FNS_SUPPLEMENTAL
} WHPFunctionList;

phase_advance(PHASE_ACCEL_CREATED)

接下来，将机器状态置为 PHASE_ACCEL_CREATED，实现代码如下：

    /* Advance the machine init phase now that the accelerator is configured. */
    phase_advance(PHASE_ACCEL_CREATED);

函数 phase_advance() 定义如下：

/*
 * Move the global machine initialisation phase forward to @phase.
 *
 * Phases may only be entered one step at a time: the function asserts
 * that the current phase is the one immediately preceding @phase.
 */
void phase_advance(MachineInitPhase phase)
{
    assert(phase - 1 == machine_phase);

    machine_phase = phase;
}

机器状态 machine_phase 的数据类型定义如下：

typedef enum MachineInitPhase {/* current_machine is NULL.  */PHASE_NO_MACHINE,/* current_machine is not NULL, but current_machine->accel is NULL.  */PHASE_MACHINE_CREATED,/** current_machine->accel is not NULL, but the machine properties have* not been validated and machine_class->init has not yet been called.*/PHASE_ACCEL_CREATED,/** machine_class->init has been called, thus creating any embedded* devices and validating machine properties.  Devices created at* this time are considered to be cold-plugged.*/PHASE_MACHINE_INITIALIZED,/** QEMU is ready to start CPUs and devices created at this time* are considered to be hot-plugged.  The monitor is not restricted* to "preconfig" commands.*/PHASE_MACHINE_READY,
} MachineInitPhase;