[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [RFC PATCH 3/3] spapr: implement nested-hv support for the TCG virtu
From: |
Fabiano Rosas |
Subject: |
Re: [RFC PATCH 3/3] spapr: implement nested-hv support for the TCG virtual hypervisor |
Date: |
Mon, 14 Feb 2022 11:32:21 -0300 |
Nicholas Piggin <npiggin@gmail.com> writes:
> This implements the nested-hv hcall API for spapr under TCG.
> It's still a bit rough around the edges, concept seems to work.
>
> Some HV exceptions can be raised now in the TCG spapr machine when
> running a nested guest. The main ones are the lev==1 syscall, the
> hdecr, hdsi and hisi, and h_virt external interrupts. These are
> dealt with in the interrupt delivery code by noticing MSR[HV] raised
> and instead of switching the machine to HV mode, it exits the
> H_ENTER_NESTED hcall with the interrupt vector as return value as
> required by the hcall API.
>
> Address translation is provided by the 2-level page table walker
> that is implemented for the pnv machine. The partition scope page
> table is pointed to the L1's partition scope, and a few tests have
> to take into account that nested-hv translations are 2-level. This
> could perhaps be tidied up a bit e.g., with a 'bool two_level = ...'
> but it's surprisingly little code.
>
> There is no TLB tagging between L1 and L2 translations at the moment
> so the TLB is flushed on any L1<->L2 transition (hcall entry and exit).
>
> XXX: stop doing atomic RC on page table walks (not for nested but in general)
>
> not-yet-Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Hi Nick,
Sorry it took me so long to get to this. It think overall it looks
good. I have some comments about code structure that you probably
already know about.
I don't have answers to your XXX questions but I think it's ok to handle
them later individually as they come up, after this is merged.
> ---
> hw/ppc/ppc.c | 20 +++
> hw/ppc/spapr.c | 16 ++
> hw/ppc/spapr_caps.c | 5 +-
> hw/ppc/spapr_hcall.c | 316 +++++++++++++++++++++++++++++++++++++
> include/hw/ppc/ppc.h | 3 +
> include/hw/ppc/spapr.h | 75 ++++++++-
> target/ppc/cpu.h | 6 +
> target/ppc/excp_helper.c | 60 ++++---
> target/ppc/helper_regs.c | 1 +
> target/ppc/mmu-book3s-v3.c | 20 ++-
> target/ppc/mmu-radix64.c | 15 +-
> 11 files changed, 499 insertions(+), 38 deletions(-)
>
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index a7c262db93..135900a6f4 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -1083,6 +1083,26 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env,
> uint32_t freq)
> return &cpu_ppc_set_tb_clk;
> }
>
> +void cpu_ppc_hdecr_init (CPUPPCState *env)
> +{
> + PowerPCCPU *cpu = env_archcpu(env);
> +
> + assert(env->tb_env->hdecr_timer == NULL);
> +
> + env->tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> &cpu_ppc_hdecr_cb,
> + cpu);
> +}
> +
> +void cpu_ppc_hdecr_exit (CPUPPCState *env)
> +{
> + PowerPCCPU *cpu = env_archcpu(env);
> +
> + timer_free(env->tb_env->hdecr_timer);
> + env->tb_env->hdecr_timer = NULL;
> +
> + cpu_ppc_hdecr_lower(cpu);
> +}
> +
> /* Specific helpers for POWER & PowerPC 601 RTC */
> void cpu_ppc601_store_rtcu (CPUPPCState *env, uint32_t value)
> {
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3d6ec309dd..f0c3f726f2 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1273,6 +1273,8 @@ static void
> emulate_spapr_hypercall(PPCVirtualHypervisor *vhyp,
> if (msr_pr) {
> hcall_dprintf("Hypercall made with MSR[PR]=1\n");
> env->gpr[3] = H_PRIVILEGE;
> + } else if (env->gpr[3] == KVMPPC_H_ENTER_NESTED) {
> + spapr_enter_nested(cpu);
This looks like it could be plumbed along with spapr_hypercall below.
> } else {
> env->gpr[3] = spapr_hypercall(cpu, env->gpr[3], &env->gpr[4]);
> }
> @@ -4465,6 +4467,17 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id)
> return NULL;
> }
>
> +static bool spapr_cpu_in_nested(PowerPCCPU *cpu)
> +{
> + return cpu->in_spapr_nested;
> +}
> +
> +static target_ulong spapr_get_nested_ptcr(PowerPCCPU *cpu, target_ulong lpid)
> +{
> + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> + return spapr->nested_ptcr;
> +}
> +
> static void spapr_cpu_exec_enter(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu)
> {
> SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> @@ -4573,6 +4586,9 @@ static void spapr_machine_class_init(ObjectClass *oc,
> void *data)
> fwc->get_dev_path = spapr_get_fw_dev_path;
> nc->nmi_monitor_handler = spapr_nmi;
> smc->phb_placement = spapr_phb_placement;
> + vhc->cpu_in_nested = spapr_cpu_in_nested;
> + vhc->get_nested_ptcr = spapr_get_nested_ptcr;
> + vhc->exit_nested = spapr_exit_nested;
> vhc->hypercall = emulate_spapr_hypercall;
> vhc->hpt_mask = spapr_hpt_mask;
> vhc->map_hptes = spapr_map_hptes;
> diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
> index ed7c077a0d..a665245f6f 100644
> --- a/hw/ppc/spapr_caps.c
> +++ b/hw/ppc/spapr_caps.c
> @@ -450,10 +450,7 @@ static void cap_nested_kvm_hv_apply(SpaprMachineState
> *spapr,
> return;
> }
>
> - if (tcg_enabled()) {
> - error_setg(errp, "No Nested KVM-HV support in TCG");
> - error_append_hint(errp, "Try appending -machine
> cap-nested-hv=off\n");
> - } else if (kvm_enabled()) {
> + if (!tcg_enabled()) {
> if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_3_00, 0,
> spapr->max_compat_pvr)) {
> error_setg(errp, "Nested KVM-HV only supported on POWER9");
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 222c1b6bbd..8ffb13ada0 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -9,6 +9,7 @@
> #include "qemu/error-report.h"
> #include "exec/exec-all.h"
> #include "helper_regs.h"
> +#include "hw/ppc/ppc.h"
> #include "hw/ppc/spapr.h"
> #include "hw/ppc/spapr_cpu_core.h"
> #include "mmu-hash64.h"
> @@ -1497,6 +1498,317 @@ static void hypercall_register_softmmu(void)
> }
> #endif
>
> +/* TCG only */
> +#define PRTS_MASK 0x1f
> +
> +static target_ulong h_set_ptbl(PowerPCCPU *cpu,
> + SpaprMachineState *spapr,
> + target_ulong opcode,
> + target_ulong *args)
> +{
> + target_ulong ptcr = args[0];
> +
> + if (!spapr_get_cap(spapr, SPAPR_CAP_NESTED_KVM_HV)) {
> + return H_FUNCTION;
> + }
> +
> + if ((ptcr & PRTS_MASK) + 12 - 4 > 12) {
> + return H_PARAMETER;
> + }
> +
> + spapr->nested_ptcr = ptcr; /* Save new partition table */
> +
> + return H_SUCCESS;
> +}
> +
> +static target_ulong h_tlb_invalidate(PowerPCCPU *cpu,
> + SpaprMachineState *spapr,
> + target_ulong opcode,
> + target_ulong *args)
> +{
> + CPUState *cs = CPU(cpu);
> +
> + /*
> + * The spapr virtual hypervisor nested HV implementation retains no
> + * translation state except for TLB. This might be optimised to
> + * invalidate fewer entries, but at the moment it's not important
> + * because L1<->L2 transitions always flush the entire TLB for now.
> + */
> + tlb_flush(cs);
> +
> + return H_SUCCESS;
> +}
> +
> +static target_ulong h_copy_tofrom_guest(PowerPCCPU *cpu,
> + SpaprMachineState *spapr,
> + target_ulong opcode,
> + target_ulong *args)
> +{
> + /*
> + * This HCALL is not required, L1 KVM will take a slow path and walk the
> + * page tables manually to do the data copy.
> + */
Kind of unrelated to the patch itself but I thought the point of this
hcall was less about performance and more about Ln not having access to
the KVM memslots for the Ln+2.
KVM accesses a guest address space by using the GFN to find the memory
slot and from there the userspace address that is part of the
memslot. Accesses from Ln -> Ln+1 can use that mechanism because any Ln
will have slots registered for its guests.
For L0 -> L2, we can go via radix quadrants because the L0 is the real
HV (that's how L0 handles this hcall after all).
For L1 -> L3, we need the hcall since L1 cannot access quadrants 1 and 2
and it does not have the memslot so it cannot use the QEMU address.
Does that make sense?
> + return H_FUNCTION;
> +}
> +
> +void spapr_enter_nested(PowerPCCPU *cpu)
> +{
> + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> + PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> + CPUState *cs = CPU(cpu);
> + CPUPPCState *env = &cpu->env;
> + target_ulong hv_ptr = env->gpr[4];
> + target_ulong regs_ptr = env->gpr[5];
> + target_ulong hdec, now = cpu_ppc_load_tbl(env);
> + struct kvmppc_hv_guest_state *hvstate;
> + struct kvmppc_hv_guest_state hv_state;
> + struct kvmppc_pt_regs *regs;
> + hwaddr len;
> + uint32_t cr;
> + int i;
> +
> + if (cpu->in_spapr_nested) {
> + env->gpr[3] = H_FUNCTION;
> + return;
> + }
> + if (spapr->nested_ptcr == 0) {
> + env->gpr[3] = H_NOT_AVAILABLE;
> + return;
> + }
> +
> + len = sizeof(*hvstate);
> + hvstate = cpu_physical_memory_map(hv_ptr, &len, true);
> + if (!hvstate || len != sizeof(*hvstate)) {
> + env->gpr[3] = H_PARAMETER;
> + return;
> + }
> +
> + memcpy(&hv_state, hvstate, len);
> +
> + cpu_physical_memory_unmap(hvstate, len, 0 /* read */, len /* access len
> */);
> +
> + if (hv_state.version != HV_GUEST_STATE_VERSION) {
KVM uses > here. Should we do the same? Daniel and I have been
discussing if we need some sort of version negotiation for nested KVM
since recently someone tried to use an L0 version 1 with an L1 version
2:
https://gitlab.com/qemu-project/qemu/-/issues/860
> + env->gpr[3] = H_PARAMETER;
> + return;
> + }
> +
<snip>
> +/* 64-bit powerpc pt_regs struct used by nested HV */
> +struct kvmppc_pt_regs {
> + uint64_t gpr[32];
> + uint64_t nip;
> + uint64_t msr;
> + uint64_t orig_gpr3; /* Used for restarting system calls */
> + uint64_t ctr;
> + uint64_t link;
> + uint64_t xer;
> + uint64_t ccr;
> + uint64_t softe; /* Soft enabled/disabled */
> + uint64_t trap; /* Reason for being here */
> + uint64_t dar; /* Fault registers */
> + uint64_t dsisr; /* on 4xx/Book-E used for ESR */
There's no 4xx/booke support for spapr nested.
> + uint64_t result; /* Result of a system call */
> +};
>
> typedef struct SpaprDeviceTreeUpdateHeader {
> uint32_t version_id;
> @@ -604,6 +673,10 @@ typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu,
> SpaprMachineState *sm,
> void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
> target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
> target_ulong *args);
> +
> +void spapr_enter_nested(PowerPCCPU *cpu);
> +void spapr_exit_nested(PowerPCCPU *cpu, int excp);
> +
> target_ulong softmmu_resize_hpt_prepare(PowerPCCPU *cpu, SpaprMachineState
> *spapr,
> target_ulong shift);
> target_ulong softmmu_resize_hpt_commit(PowerPCCPU *cpu, SpaprMachineState
> *spapr,
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index dcd83b503c..1806a8e776 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -1306,6 +1306,9 @@ struct PowerPCCPU {
> bool pre_2_10_migration;
> bool pre_3_0_migration;
> int32_t mig_slb_nr;
> +
> + bool in_spapr_nested;
> + CPUPPCState *nested_host_state;
> };
>
>
> @@ -1316,6 +1319,9 @@ PowerPCCPUClass
> *ppc_cpu_get_family_class(PowerPCCPUClass *pcc);
> #ifndef CONFIG_USER_ONLY
> struct PPCVirtualHypervisorClass {
> InterfaceClass parent;
> + bool (*cpu_in_nested)(PowerPCCPU *cpu);
> + target_ulong (*get_nested_ptcr)(PowerPCCPU *cpu, target_ulong lpid);
> + void (*exit_nested)(PowerPCCPU *cpu, int excp);
> void (*hypercall)(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu);
> hwaddr (*hpt_mask)(PPCVirtualHypervisor *vhyp);
> const ppc_hash_pte64_t *(*map_hptes)(PPCVirtualHypervisor *vhyp,
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index c107953dec..239c253dbc 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -361,8 +361,8 @@ static void ppc_excp_apply_ail(PowerPCCPU *cpu, int
> excp_model, int excp,
> #endif
> }
>
> -static void powerpc_set_excp_state(PowerPCCPU *cpu,
> - target_ulong vector, target_ulong
> msr)
> +static void powerpc_set_excp_state(PowerPCCPU *cpu, int excp,
> + target_ulong vector, target_ulong msr)
> {
> CPUState *cs = CPU(cpu);
> CPUPPCState *env = &cpu->env;
> @@ -375,9 +375,17 @@ static void powerpc_set_excp_state(PowerPCCPU *cpu,
> * will prevent setting of the HV bit which some exceptions might need
> * to do.
> */
> - env->msr = msr & env->msr_mask;
> - hreg_compute_hflags(env);
> - env->nip = vector;
> + if (cpu->vhyp && cpu->in_spapr_nested && (msr & MSR_HVB)) {
> + PPCVirtualHypervisorClass *vhc =
> + PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
> + // Deliver interrupt to L1 by returning from the H_ENTER_NESTED call
> + vhc->exit_nested(cpu, excp);
> + } else {
> + env->nip = vector;
> + env->msr = msr & env->msr_mask;
> + hreg_compute_hflags(env);
> + }
> +
All of this should fit better at the end of powerpc_excp_books. Even if
we need to duplicate it for fwnmi_machine_check.
> /* Reset exception state */
> cs->exception_index = POWERPC_EXCP_NONE;
> env->error_code = 0;
> @@ -548,7 +556,7 @@ static void powerpc_excp_40x(PowerPCCPU *cpu, int excp)
> /* Save MSR */
> env->spr[srr1] = msr;
>
> - powerpc_set_excp_state(cpu, vector, new_msr);
> + powerpc_set_excp_state(cpu, excp, vector, new_msr);
> }
>
> static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
> @@ -742,7 +750,7 @@ static void powerpc_excp_74xx(PowerPCCPU *cpu, int excp)
> /* Save MSR */
> env->spr[SPR_SRR1] = msr;
>
> - powerpc_set_excp_state(cpu, vector, new_msr);
> + powerpc_set_excp_state(cpu, excp, vector, new_msr);
> }
>
> #ifdef TARGET_PPC64
> @@ -916,7 +924,7 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int excp)
> env->nip += 4;
>
> /* "PAPR mode" built-in hypercall emulation */
> - if ((lev == 1) && cpu->vhyp) {
> + if ((lev == 1) && cpu->vhyp && !cpu->in_spapr_nested) {
> PPCVirtualHypervisorClass *vhc =
> PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
> vhc->hypercall(cpu->vhyp, cpu);
> @@ -1004,18 +1012,6 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int
> excp)
> break;
> }
>
> - /* Sanity check */
> - if (!(env->msr_mask & MSR_HVB)) {
> - if (new_msr & MSR_HVB) {
> - cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
> - "no HV support\n", excp);
> - }
> - if (srr0 == SPR_HSRR0) {
> - cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
> - "no HV support\n", excp);
> - }
> - }
> -
> /*
> * Sort out endianness of interrupt, this differs depending on the
> * CPU, the HV mode, etc...
> @@ -1037,7 +1033,19 @@ static void powerpc_excp_books(PowerPCCPU *cpu, int
> excp)
> /* This can update new_msr and vector if AIL applies */
> ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
>
> - powerpc_set_excp_state(cpu, vector, new_msr);
> + powerpc_set_excp_state(cpu, excp, vector, new_msr);
> +
> + /* Sanity check */
> + if (!(env->msr_mask & MSR_HVB)) {
> + if (env->msr & MSR_HVB) {
> + cpu_abort(cs, "Trying to deliver HV exception (MSR) %d with "
> + "no HV support\n", excp);
> + }
FYI, this doesn't exist anymore.
> + if (0 && srr0 == SPR_HSRR0) {
> + cpu_abort(cs, "Trying to deliver HV exception (HSRR) %d with "
> + "no HV support\n", excp);
> + }
msr_mask should have MSR_HVB independently of MSR_HV being actually set,
so this shouldn't be causing any problems. Is it?
> + }
> }
> #else
> static inline void powerpc_excp_books(PowerPCCPU *cpu, int excp)
> @@ -1517,7 +1525,7 @@ static inline void powerpc_excp_legacy(PowerPCCPU *cpu,
> int excp)
> /* This can update new_msr and vector if AIL applies */
> ppc_excp_apply_ail(cpu, excp_model, excp, msr, &new_msr, &vector);
>
> - powerpc_set_excp_state(cpu, vector, new_msr);
> + powerpc_set_excp_state(cpu, excp, vector, new_msr);
> }
>
> static void powerpc_excp(PowerPCCPU *cpu, int excp)
> @@ -1613,7 +1621,11 @@ static void ppc_hw_interrupt(CPUPPCState *env)
> /* HEIC blocks delivery to the hypervisor */
> if ((async_deliver && !(heic && msr_hv && !msr_pr)) ||
> (env->has_hv_mode && msr_hv == 0 && !lpes0)) {
> - powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
> + if (cpu->in_spapr_nested) {
> + powerpc_excp(cpu, POWERPC_EXCP_HVIRT);
> + } else {
> + powerpc_excp(cpu, POWERPC_EXCP_EXTERNAL);
> + }
This function affects other CPUs, so it's better to leave this as it
were and change the exception inside powerpc_excp_books, like we do for
HV_EMUL.
> return;
> }
> }
> @@ -1723,7 +1735,7 @@ void ppc_cpu_do_fwnmi_machine_check(CPUState *cs,
> target_ulong vector)
> msr |= (1ULL << MSR_LE);
> }
>
> - powerpc_set_excp_state(cpu, vector, msr);
> + powerpc_set_excp_state(cpu, POWERPC_EXCP_MCHECK, vector, msr);
> }
>
> bool ppc_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
> diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
> index 5b12cb03c9..7f92606522 100644
> --- a/target/ppc/helper_regs.c
> +++ b/target/ppc/helper_regs.c
> @@ -163,6 +163,7 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState
> *env)
> immu_idx |= msr & (1 << MSR_IS) ? 2 : 0;
> dmmu_idx |= msr & (1 << MSR_DS) ? 2 : 0;
> } else {
> + /* Could have nested IDX instead of HV to avoid tlb flush on nested
> enter/exit? */
Sounds like a good idea. However I don't know enough about softmmu stuff
to point a direction here.
> dmmu_idx |= msr & (1ull << MSR_HV) ? 4 : 0;
> immu_idx = dmmu_idx;
> immu_idx |= msr & (1 << MSR_IR) ? 0 : 2;