qemu-ppc
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 2/4] target/ppc: Add recording of taken branches to BHRB


From: Nicholas Piggin
Subject: Re: [PATCH 2/4] target/ppc: Add recording of taken branches to BHRB
Date: Fri, 15 Sep 2023 11:02:48 +1000

On Wed Sep 13, 2023 at 6:24 AM AEST, Glenn Miles wrote:
> This commit continues adding support for the Branch History
> Rolling Buffer (BHRB) as is provided starting with the P8
> processor and continuing with its successors.  This commit
> is limited to the recording and filtering of taken branches.
>
> The following changes were made:
>
>   - Added a BHRB buffer for storing branch instruction and
>     target addresses for taken branches
>   - Renamed gen_update_cfar to gen_update_branch_history and
>     added a 'target' parameter to hold the branch target
>     address and 'inst_type' parameter to use for filtering
>   - Added a combination of jit-time and run-time checks to
>     gen_update_branch_history for determining if a branch
>     should be recorded
>   - Added TCG code to gen_update_branch_history that stores
>     data to the BHRB and updates the BHRB offset.
>   - Added BHRB resource initialization and reset functions
>   - Enabled functionality for P8, P9 and P10 processors.
>
> Signed-off-by: Glenn Miles <milesg@linux.vnet.ibm.com>
> ---
>  target/ppc/cpu.h                       |  18 +++-
>  target/ppc/cpu_init.c                  |  41 ++++++++-
>  target/ppc/helper_regs.c               |  32 +++++++
>  target/ppc/helper_regs.h               |   1 +
>  target/ppc/power8-pmu.c                |   2 +
>  target/ppc/power8-pmu.h                |   7 ++
>  target/ppc/translate.c                 | 114 +++++++++++++++++++++++--
>  target/ppc/translate/branch-impl.c.inc |   2 +-
>  8 files changed, 205 insertions(+), 12 deletions(-)
>
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index 20ae1466a5..bda1afb700 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -454,8 +454,9 @@ FIELD(MSR, LE, MSR_LE, 1)
>  #define MMCR2_UREG_MASK (MMCR2_FC1P0 | MMCR2_FC2P0 | MMCR2_FC3P0 | \
>                           MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)
>  
> -#define MMCRA_BHRBRD    PPC_BIT(26)            /* BHRB Recording Disable */
> -
> +#define MMCRA_BHRBRD    PPC_BIT(26)         /* BHRB Recording Disable */

Fold this tidying into patch 1.

> +#define MMCRA_IFM_MASK  PPC_BITMASK(32, 33) /* BHRB Instruction Filtering */
> +#define MMCRA_IFM_SHIFT PPC_BIT_NR(33)
>  
>  #define MMCR1_EVT_SIZE 8
>  /* extract64() does a right shift before extracting */
> @@ -682,6 +683,8 @@ enum {
>      POWERPC_FLAG_SMT      = 0x00400000,
>      /* Using "LPAR per core" mode  (as opposed to per-thread)                
> */
>      POWERPC_FLAG_SMT_1LPAR = 0x00800000,
> +    /* Has BHRB */
> +    POWERPC_FLAG_BHRB      = 0x01000000,
>  };

Interesting question of which patch to add different flags. I'm
strongly in add when you add code that uses them like this one,
but it's a matter of taste and not always practical to be an
absolute rule. I don't mind too much what others do, but maybe
this and the pcc->flags init should go in patch 1 since that's adding
flags that aren't yet used?

>  
>  /*
> @@ -1110,6 +1113,9 @@ DEXCR_ASPECT(PHIE, 6)
>  #define PPC_CPU_OPCODES_LEN          0x40
>  #define PPC_CPU_INDIRECT_OPCODES_LEN 0x20
>  
> +#define BHRB_MAX_NUM_ENTRIES_LOG2 (5)
> +#define BHRB_MAX_NUM_ENTRIES      (1 << BHRB_MAX_NUM_ENTRIES_LOG2)
> +
>  struct CPUArchState {
>      /* Most commonly used resources during translated code execution first */
>      target_ulong gpr[32];  /* general purpose registers */
> @@ -1196,6 +1202,14 @@ struct CPUArchState {
>      int dcache_line_size;
>      int icache_line_size;
>  
> +    /* Branch History Rolling Buffer (BHRB) resources */
> +    target_ulong bhrb_num_entries;
> +    target_ulong bhrb_base;
> +    target_ulong bhrb_filter;
> +    target_ulong bhrb_offset;
> +    target_ulong bhrb_offset_mask;
> +    uint64_t bhrb[BHRB_MAX_NUM_ENTRIES];

Put these under ifdef TARGET_PPC64?

> +
>      /* These resources are used during exception processing */
>      /* CPU model definition */
>      target_ulong msr_mask;
> diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
> index 568f9c3b88..19d7505a73 100644
> --- a/target/ppc/cpu_init.c
> +++ b/target/ppc/cpu_init.c
> @@ -6100,6 +6100,28 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
>      pcc->l1_icache_size = 0x8000;
>  }
>  
> +static void bhrb_init_state(CPUPPCState *env, target_long num_entries_log2)
> +{
> +    if (env->flags & POWERPC_FLAG_BHRB) {
> +        if (num_entries_log2 > BHRB_MAX_NUM_ENTRIES_LOG2) {
> +            num_entries_log2 = BHRB_MAX_NUM_ENTRIES_LOG2;
> +        }
> +        env->bhrb_num_entries = 1 << num_entries_log2;
> +        env->bhrb_base = (target_long)&env->bhrb[0];
> +        env->bhrb_offset_mask = (env->bhrb_num_entries * sizeof(uint64_t)) - 
> 1;
> +    }
> +}
> +
> +static void bhrb_reset_state(CPUPPCState *env)
> +{
> +    if (env->flags & POWERPC_FLAG_BHRB) {
> +        env->bhrb_offset = 0;
> +        env->bhrb_filter = 0;
> +        memset(env->bhrb, 0, sizeof(env->bhrb));
> +    }
> +}
> +
> +#define POWER8_BHRB_ENTRIES_LOG2 5
>  static void init_proc_POWER8(CPUPPCState *env)
>  {
>      /* Common Registers */
> @@ -6141,6 +6163,8 @@ static void init_proc_POWER8(CPUPPCState *env)
>      env->dcache_line_size = 128;
>      env->icache_line_size = 128;
>  
> +    bhrb_init_state(env, POWER8_BHRB_ENTRIES_LOG2);
> +
>      /* Allocate hardware IRQ controller */
>      init_excp_POWER8(env);
>      ppcPOWER7_irq_init(env_archcpu(env));
> @@ -6241,7 +6265,8 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
>      pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE |
>                   POWERPC_FLAG_BE | POWERPC_FLAG_PMM |
>                   POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR |
> -                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM;
> +                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM |
> +                 POWERPC_FLAG_BHRB;
>      pcc->l1_dcache_size = 0x8000;
>      pcc->l1_icache_size = 0x8000;
>  }
> @@ -6265,6 +6290,7 @@ static struct ppc_radix_page_info 
> POWER9_radix_page_info = {
>  };
>  #endif /* CONFIG_USER_ONLY */
>  
> +#define POWER9_BHRB_ENTRIES_LOG2 5
>  static void init_proc_POWER9(CPUPPCState *env)
>  {
>      /* Common Registers */
> @@ -6315,6 +6341,8 @@ static void init_proc_POWER9(CPUPPCState *env)
>      env->dcache_line_size = 128;
>      env->icache_line_size = 128;
>  
> +    bhrb_init_state(env, POWER9_BHRB_ENTRIES_LOG2);
> +
>      /* Allocate hardware IRQ controller */
>      init_excp_POWER9(env);
>      ppcPOWER9_irq_init(env_archcpu(env));
> @@ -6434,7 +6462,8 @@ POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data)
>      pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE |
>                   POWERPC_FLAG_BE | POWERPC_FLAG_PMM |
>                   POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR |
> -                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV;
> +                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV |
> +                 POWERPC_FLAG_BHRB;
>      pcc->l1_dcache_size = 0x8000;
>      pcc->l1_icache_size = 0x8000;
>  }
> @@ -6458,6 +6487,7 @@ static struct ppc_radix_page_info 
> POWER10_radix_page_info = {
>  };
>  #endif /* !CONFIG_USER_ONLY */
>  
> +#define POWER10_BHRB_ENTRIES_LOG2 5
>  static void init_proc_POWER10(CPUPPCState *env)
>  {
>      /* Common Registers */
> @@ -6505,6 +6535,8 @@ static void init_proc_POWER10(CPUPPCState *env)
>      env->dcache_line_size = 128;
>      env->icache_line_size = 128;
>  
> +    bhrb_init_state(env, POWER10_BHRB_ENTRIES_LOG2);
> +
>      /* Allocate hardware IRQ controller */
>      init_excp_POWER10(env);
>      ppcPOWER9_irq_init(env_archcpu(env));
> @@ -6610,7 +6642,8 @@ POWERPC_FAMILY(POWER10)(ObjectClass *oc, void *data)
>      pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE |
>                   POWERPC_FLAG_BE | POWERPC_FLAG_PMM |
>                   POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR |
> -                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV;
> +                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV |
> +                 POWERPC_FLAG_BHRB;
>      pcc->l1_dcache_size = 0x8000;
>      pcc->l1_icache_size = 0x8000;
>  }
> @@ -7178,6 +7211,8 @@ static void ppc_cpu_reset_hold(Object *obj)
>          }
>          env->spr[i] = spr->default_value;
>      }
> +
> +    bhrb_reset_state(env);
>  }
>  
>  #ifndef CONFIG_USER_ONLY
> diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
> index 4ff054063d..68b27196d9 100644
> --- a/target/ppc/helper_regs.c
> +++ b/target/ppc/helper_regs.c
> @@ -752,3 +752,35 @@ void register_usprgh_sprs(CPUPPCState *env)
>                   &spr_read_ureg, SPR_NOACCESS,
>                   0x00000000);
>  }
> +
> +void hreg_bhrb_filter_update(CPUPPCState *env)
> +{
> +    target_long ifm;
> +
> +    if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE)) {
> +        /* disable recording to BHRB */
> +        env->bhrb_filter = BHRB_TYPE_NORECORD;
> +        return;
> +    }
> +
> +    ifm = (env->spr[SPR_POWER_MMCRA] & MMCRA_IFM_MASK) >> MMCRA_IFM_SHIFT;
> +    switch (ifm) {
> +    case 0:
> +        /* record all branches */
> +        env->bhrb_filter = -1;
> +        break;
> +    case 1:
> +        /* only record calls (LK = 1) */
> +        env->bhrb_filter = BHRB_TYPE_CALL;
> +        break;
> +    case 2:
> +        /* only record indirect branches */
> +        env->bhrb_filter = BHRB_TYPE_INDIRECT;
> +        break;
> +    case 3:
> +        /* only record conditional branches */
> +        env->bhrb_filter = BHRB_TYPE_COND;
> +        break;
> +    }
> +}
> +
> diff --git a/target/ppc/helper_regs.h b/target/ppc/helper_regs.h
> index 8196c1346d..ab6aba1c24 100644
> --- a/target/ppc/helper_regs.h
> +++ b/target/ppc/helper_regs.h
> @@ -25,6 +25,7 @@ void hreg_compute_hflags(CPUPPCState *env);
>  void hreg_update_pmu_hflags(CPUPPCState *env);
>  void cpu_interrupt_exittb(CPUState *cs);
>  int hreg_store_msr(CPUPPCState *env, target_ulong value, int alter_hv);
> +void hreg_bhrb_filter_update(CPUPPCState *env);
>  
>  #ifdef CONFIG_USER_ONLY
>  static inline void check_tlb_flush(CPUPPCState *env, bool global) { }
> diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
> index 6f5d4e1256..18149a15b2 100644
> --- a/target/ppc/power8-pmu.c
> +++ b/target/ppc/power8-pmu.c
> @@ -261,6 +261,7 @@ void helper_store_mmcr0(CPUPPCState *env, target_ulong 
> value)
>      env->spr[SPR_POWER_MMCR0] = value;
>  
>      pmu_mmcr01a_updated(env);
> +    hreg_bhrb_filter_update(env);
>  
>      /* Update cycle overflow timers with the current MMCR0 state */
>      pmu_update_overflow_timers(env);
> @@ -280,6 +281,7 @@ void helper_store_mmcrA(CPUPPCState *env, uint64_t value)
>      env->spr[SPR_POWER_MMCRA] = value;
>  
>      pmu_mmcr01a_updated(env);
> +    hreg_bhrb_filter_update(env);
>  }
>  

Can these just be moved into pmu_mmcr01a_updated? AFAIKS they only
depend on MMCR0/A.

>  target_ulong helper_read_pmc(CPUPPCState *env, uint32_t sprn)
> diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h
> index 87fa8c9334..a887094045 100644
> --- a/target/ppc/power8-pmu.h
> +++ b/target/ppc/power8-pmu.h
> @@ -17,6 +17,13 @@
>  
>  #define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
>  
> +#define BHRB_TYPE_NORECORD      0x00
> +#define BHRB_TYPE_CALL          0x01
> +#define BHRB_TYPE_INDIRECT      0x02
> +#define BHRB_TYPE_COND          0x04
> +#define BHRB_TYPE_OTHER         0x08
> +#define BHRB_TYPE_XL_FORM       0x10
> +
>  void cpu_ppc_pmu_init(CPUPPCState *env);
>  void pmu_mmcr01a_updated(CPUPPCState *env);
>  #else
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index d93fbd4574..7824475f54 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -177,6 +177,7 @@ struct DisasContext {
>  #if defined(TARGET_PPC64)
>      bool sf_mode;
>      bool has_cfar;
> +    bool has_bhrb;
>  #endif
>      bool fpu_enabled;
>      bool altivec_enabled;
> @@ -4104,12 +4105,100 @@ static void gen_rvwinkle(DisasContext *ctx)
>  }
>  #endif /* #if defined(TARGET_PPC64) */
>  
> -static inline void gen_update_cfar(DisasContext *ctx, target_ulong nip)
> +
> +static inline void gen_update_branch_history(DisasContext *ctx,
> +                                             target_ulong nip,
> +                                             TCGv target,
> +                                             target_long inst_type)
>  {
>  #if defined(TARGET_PPC64)
> +    TCGv t0;
> +    TCGv t1;
> +    TCGv t2;
> +    TCGv t3;
> +    TCGLabel *no_update;
> +
>      if (ctx->has_cfar) {
>          tcg_gen_movi_tl(cpu_cfar, nip);
>      }
> +
> +    if (!ctx->has_bhrb || inst_type == BHRB_TYPE_NORECORD) {
> +        return;
> +    }
> +
> +    /* ISA 3.1 adds the PMCRA[BRHBRD] and problem state checks */
> +    if ((ctx->insns_flags2 & PPC2_ISA310) && (ctx->mmcra_bhrbrd || 
> !ctx->pr)) {
> +        return;
> +    }
> +
> +    /* Check for BHRB "frozen" conditions */
> +    if (ctx->mmcr0_fcpc) {
> +        if (ctx->mmcr0_fcp) {
> +            if ((ctx->hv) && (ctx->pr)) {
> +                return;
> +            }
> +        } else if (!(ctx->hv) && (ctx->pr)) {
> +            return;
> +        }
> +    } else if ((ctx->mmcr0_fcp) && (ctx->pr)) {
> +        return;
> +    }
> +
> +    t0 = tcg_temp_new();
> +    t1 = tcg_temp_new();
> +    t2 = tcg_temp_new();
> +    t3 = tcg_temp_new();
> +    no_update = gen_new_label();
> +
> +    /* check for bhrb filtering */
> +    tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_filter));
> +    tcg_gen_andi_tl(t0, t0, inst_type);
> +    tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, no_update);
> +
> +    /* load bhrb base address into t2 */
> +    tcg_gen_ld_tl(t2, cpu_env, offsetof(CPUPPCState, bhrb_base));
> +
> +    /* load current bhrb_offset into t0 */
> +    tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_offset));
> +
> +    /* load a BHRB offset mask into t3 */
> +    tcg_gen_ld_tl(t3, cpu_env, offsetof(CPUPPCState, bhrb_offset_mask));
> +
> +    /* add t2 to t0 to get address of bhrb entry */
> +    tcg_gen_add_tl(t1, t0, t2);
> +
> +    /* store nip into bhrb at bhrb_offset */
> +    tcg_gen_st_i64(tcg_constant_i64(nip), (TCGv_ptr)t1, 0);
> +
> +    /* add 8 to current bhrb_offset */
> +    tcg_gen_addi_tl(t0, t0, 8);
> +
> +    /* apply offset mask */
> +    tcg_gen_and_tl(t0, t0, t3);

For some cases where the value remains live for a time and not reused,
maybe use a more descriptive name? I know existing code is pretty lazy
with names (and I share some blame). t0,t2,t3 could be base, offset,
mask respectively, and t1 could be tmp that is used for filter and
address generation. Just a suggestion?


> +
> +    if (inst_type & BHRB_TYPE_XL_FORM) {
> +        /* Also record the target address for XL-Form branches */
> +
> +        /* add t2 to t0 to get address of bhrb entry */
> +        tcg_gen_add_tl(t1, t0, t2);
> +
> +        /* Set the 'T' bit for target entries */
> +        tcg_gen_ori_tl(t2, target, 0x2);
> +
> +        /* Store target address in bhrb */
> +        tcg_gen_st_tl(t2, (TCGv_ptr)t1, 0);
> +
> +        /* add 8 to current bhrb_offset */
> +        tcg_gen_addi_tl(t0, t0, 8);
> +
> +        /* apply offset mask */
> +        tcg_gen_and_tl(t0, t0, t3);
> +    }

I would say this is bordering on warranting a new function since it
mostly duplicates previous block. You could have it take base, mask,
offset and value to store, and have it return new offset.

> +
> +    /* save updated bhrb_offset for next time */
> +    tcg_gen_st_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_offset));
> +
> +    gen_set_label(no_update);
>  #endif
>  }

It all looks pretty good otherwise. I do worry about POWER8/9 which
do not have BHRB disable bit. How much do they slow down I wonder?

Thanks,
Nick



reply via email to

[Prev in Thread] Current Thread [Next in Thread]