qemu-ppc
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH 2/4] target/ppc: Add recording of taken branches to BHRB


From: Glenn Miles
Subject: Re: [PATCH 2/4] target/ppc: Add recording of taken branches to BHRB
Date: Tue, 19 Sep 2023 16:35:59 -0500

On 2023-09-14 20:02, Nicholas Piggin wrote:
On Wed Sep 13, 2023 at 6:24 AM AEST, Glenn Miles wrote:
This commit continues adding support for the Branch History
Rolling Buffer (BHRB) as is provided starting with the P8
processor and continuing with its successors.  This commit
is limited to the recording and filtering of taken branches.

The following changes were made:

  - Added a BHRB buffer for storing branch instruction and
    target addresses for taken branches
  - Renamed gen_update_cfar to gen_update_branch_history and
    added a 'target' parameter to hold the branch target
    address and 'inst_type' parameter to use for filtering
  - Added a combination of jit-time and run-time checks to
    gen_update_branch_history for determining if a branch
    should be recorded
  - Added TCG code to gen_update_branch_history that stores
    data to the BHRB and updates the BHRB offset.
  - Added BHRB resource initialization and reset functions
  - Enabled functionality for P8, P9 and P10 processors.

Signed-off-by: Glenn Miles <milesg@linux.vnet.ibm.com>
---
 target/ppc/cpu.h                       |  18 +++-
 target/ppc/cpu_init.c                  |  41 ++++++++-
 target/ppc/helper_regs.c               |  32 +++++++
 target/ppc/helper_regs.h               |   1 +
 target/ppc/power8-pmu.c                |   2 +
 target/ppc/power8-pmu.h                |   7 ++
target/ppc/translate.c | 114 +++++++++++++++++++++++--
 target/ppc/translate/branch-impl.c.inc |   2 +-
 8 files changed, 205 insertions(+), 12 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 20ae1466a5..bda1afb700 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -454,8 +454,9 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR2_UREG_MASK (MMCR2_FC1P0 | MMCR2_FC2P0 | MMCR2_FC3P0 | \
                          MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)

-#define MMCRA_BHRBRD PPC_BIT(26) /* BHRB Recording Disable */
-
+#define MMCRA_BHRBRD PPC_BIT(26) /* BHRB Recording Disable */

Fold this tidying into patch 1.

Done.


+#define MMCRA_IFM_MASK PPC_BITMASK(32, 33) /* BHRB Instruction Filtering */
+#define MMCRA_IFM_SHIFT PPC_BIT_NR(33)

 #define MMCR1_EVT_SIZE 8
 /* extract64() does a right shift before extracting */
@@ -682,6 +683,8 @@ enum {
     POWERPC_FLAG_SMT      = 0x00400000,
/* Using "LPAR per core" mode (as opposed to per-thread) */
     POWERPC_FLAG_SMT_1LPAR = 0x00800000,
+    /* Has BHRB */
+    POWERPC_FLAG_BHRB      = 0x01000000,
 };

Interesting question of which patch to add different flags. I'm
strongly in add when you add code that uses them like this one,
but it's a matter of taste and not always practical to be an
absolute rule. I don't mind too much what others do, but maybe
this and the pcc->flags init should go in patch 1 since that's adding
flags that aren't yet used?


I think I prefer keeping it in patch 2 since patch 1 is more about
the hflags, which seems unrelated to this flag.


 /*
@@ -1110,6 +1113,9 @@ DEXCR_ASPECT(PHIE, 6)
 #define PPC_CPU_OPCODES_LEN          0x40
 #define PPC_CPU_INDIRECT_OPCODES_LEN 0x20

+#define BHRB_MAX_NUM_ENTRIES_LOG2 (5)
+#define BHRB_MAX_NUM_ENTRIES      (1 << BHRB_MAX_NUM_ENTRIES_LOG2)
+
 struct CPUArchState {
/* Most commonly used resources during translated code execution first */
     target_ulong gpr[32];  /* general purpose registers */
@@ -1196,6 +1202,14 @@ struct CPUArchState {
     int dcache_line_size;
     int icache_line_size;

+    /* Branch History Rolling Buffer (BHRB) resources */
+    target_ulong bhrb_num_entries;
+    target_ulong bhrb_base;
+    target_ulong bhrb_filter;
+    target_ulong bhrb_offset;
+    target_ulong bhrb_offset_mask;
+    uint64_t bhrb[BHRB_MAX_NUM_ENTRIES];

Put these under ifdef TARGET_PPC64?


Ok.

+
     /* These resources are used during exception processing */
     /* CPU model definition */
     target_ulong msr_mask;
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 568f9c3b88..19d7505a73 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6100,6 +6100,28 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
     pcc->l1_icache_size = 0x8000;
 }

+static void bhrb_init_state(CPUPPCState *env, target_long num_entries_log2)
+{
+    if (env->flags & POWERPC_FLAG_BHRB) {
+        if (num_entries_log2 > BHRB_MAX_NUM_ENTRIES_LOG2) {
+            num_entries_log2 = BHRB_MAX_NUM_ENTRIES_LOG2;
+        }
+        env->bhrb_num_entries = 1 << num_entries_log2;
+        env->bhrb_base = (target_long)&env->bhrb[0];
+ env->bhrb_offset_mask = (env->bhrb_num_entries * sizeof(uint64_t)) - 1;
+    }
+}
+
+static void bhrb_reset_state(CPUPPCState *env)
+{
+    if (env->flags & POWERPC_FLAG_BHRB) {
+        env->bhrb_offset = 0;
+        env->bhrb_filter = 0;
+        memset(env->bhrb, 0, sizeof(env->bhrb));
+    }
+}
+
+#define POWER8_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER8(CPUPPCState *env)
 {
     /* Common Registers */
@@ -6141,6 +6163,8 @@ static void init_proc_POWER8(CPUPPCState *env)
     env->dcache_line_size = 128;
     env->icache_line_size = 128;

+    bhrb_init_state(env, POWER8_BHRB_ENTRIES_LOG2);
+
     /* Allocate hardware IRQ controller */
     init_excp_POWER8(env);
     ppcPOWER7_irq_init(env_archcpu(env));
@@ -6241,7 +6265,8 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
     pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE |
                  POWERPC_FLAG_BE | POWERPC_FLAG_PMM |
                  POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR |
-                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM;
+                 POWERPC_FLAG_VSX | POWERPC_FLAG_TM |
+                 POWERPC_FLAG_BHRB;
     pcc->l1_dcache_size = 0x8000;
     pcc->l1_icache_size = 0x8000;
 }
@@ -6265,6 +6290,7 @@ static struct ppc_radix_page_info POWER9_radix_page_info = {
 };
 #endif /* CONFIG_USER_ONLY */

+#define POWER9_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER9(CPUPPCState *env)
 {
     /* Common Registers */
@@ -6315,6 +6341,8 @@ static void init_proc_POWER9(CPUPPCState *env)
     env->dcache_line_size = 128;
     env->icache_line_size = 128;

+    bhrb_init_state(env, POWER9_BHRB_ENTRIES_LOG2);
+
     /* Allocate hardware IRQ controller */
     init_excp_POWER9(env);
     ppcPOWER9_irq_init(env_archcpu(env));
@@ -6434,7 +6462,8 @@ POWERPC_FAMILY(POWER9)(ObjectClass *oc, void *data)
     pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE |
                  POWERPC_FLAG_BE | POWERPC_FLAG_PMM |
                  POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR |
- POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV; + POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV |
+                 POWERPC_FLAG_BHRB;
     pcc->l1_dcache_size = 0x8000;
     pcc->l1_icache_size = 0x8000;
 }
@@ -6458,6 +6487,7 @@ static struct ppc_radix_page_info POWER10_radix_page_info = {
 };
 #endif /* !CONFIG_USER_ONLY */

+#define POWER10_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER10(CPUPPCState *env)
 {
     /* Common Registers */
@@ -6505,6 +6535,8 @@ static void init_proc_POWER10(CPUPPCState *env)
     env->dcache_line_size = 128;
     env->icache_line_size = 128;

+    bhrb_init_state(env, POWER10_BHRB_ENTRIES_LOG2);
+
     /* Allocate hardware IRQ controller */
     init_excp_POWER10(env);
     ppcPOWER9_irq_init(env_archcpu(env));
@@ -6610,7 +6642,8 @@ POWERPC_FAMILY(POWER10)(ObjectClass *oc, void *data)
     pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE |
                  POWERPC_FLAG_BE | POWERPC_FLAG_PMM |
                  POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR |
- POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV; + POWERPC_FLAG_VSX | POWERPC_FLAG_TM | POWERPC_FLAG_SCV |
+                 POWERPC_FLAG_BHRB;
     pcc->l1_dcache_size = 0x8000;
     pcc->l1_icache_size = 0x8000;
 }
@@ -7178,6 +7211,8 @@ static void ppc_cpu_reset_hold(Object *obj)
         }
         env->spr[i] = spr->default_value;
     }
+
+    bhrb_reset_state(env);
 }

 #ifndef CONFIG_USER_ONLY
diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
index 4ff054063d..68b27196d9 100644
--- a/target/ppc/helper_regs.c
+++ b/target/ppc/helper_regs.c
@@ -752,3 +752,35 @@ void register_usprgh_sprs(CPUPPCState *env)
                  &spr_read_ureg, SPR_NOACCESS,
                  0x00000000);
 }
+
+void hreg_bhrb_filter_update(CPUPPCState *env)
+{
+    target_long ifm;
+
+    if (!(env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE)) {
+        /* disable recording to BHRB */
+        env->bhrb_filter = BHRB_TYPE_NORECORD;
+        return;
+    }
+
+ ifm = (env->spr[SPR_POWER_MMCRA] & MMCRA_IFM_MASK) >> MMCRA_IFM_SHIFT;
+    switch (ifm) {
+    case 0:
+        /* record all branches */
+        env->bhrb_filter = -1;
+        break;
+    case 1:
+        /* only record calls (LK = 1) */
+        env->bhrb_filter = BHRB_TYPE_CALL;
+        break;
+    case 2:
+        /* only record indirect branches */
+        env->bhrb_filter = BHRB_TYPE_INDIRECT;
+        break;
+    case 3:
+        /* only record conditional branches */
+        env->bhrb_filter = BHRB_TYPE_COND;
+        break;
+    }
+}
+
diff --git a/target/ppc/helper_regs.h b/target/ppc/helper_regs.h
index 8196c1346d..ab6aba1c24 100644
--- a/target/ppc/helper_regs.h
+++ b/target/ppc/helper_regs.h
@@ -25,6 +25,7 @@ void hreg_compute_hflags(CPUPPCState *env);
 void hreg_update_pmu_hflags(CPUPPCState *env);
 void cpu_interrupt_exittb(CPUState *cs);
int hreg_store_msr(CPUPPCState *env, target_ulong value, int alter_hv);
+void hreg_bhrb_filter_update(CPUPPCState *env);

 #ifdef CONFIG_USER_ONLY
 static inline void check_tlb_flush(CPUPPCState *env, bool global) { }
diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
index 6f5d4e1256..18149a15b2 100644
--- a/target/ppc/power8-pmu.c
+++ b/target/ppc/power8-pmu.c
@@ -261,6 +261,7 @@ void helper_store_mmcr0(CPUPPCState *env, target_ulong value)
     env->spr[SPR_POWER_MMCR0] = value;

     pmu_mmcr01a_updated(env);
+    hreg_bhrb_filter_update(env);

     /* Update cycle overflow timers with the current MMCR0 state */
     pmu_update_overflow_timers(env);
@@ -280,6 +281,7 @@ void helper_store_mmcrA(CPUPPCState *env, uint64_t value)
     env->spr[SPR_POWER_MMCRA] = value;

     pmu_mmcr01a_updated(env);
+    hreg_bhrb_filter_update(env);
 }


Can these just be moved into pmu_mmcr01a_updated? AFAIKS they only
depend on MMCR0/A.


Done.

 target_ulong helper_read_pmc(CPUPPCState *env, uint32_t sprn)
diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h
index 87fa8c9334..a887094045 100644
--- a/target/ppc/power8-pmu.h
+++ b/target/ppc/power8-pmu.h
@@ -17,6 +17,13 @@

 #define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL

+#define BHRB_TYPE_NORECORD      0x00
+#define BHRB_TYPE_CALL          0x01
+#define BHRB_TYPE_INDIRECT      0x02
+#define BHRB_TYPE_COND          0x04
+#define BHRB_TYPE_OTHER         0x08
+#define BHRB_TYPE_XL_FORM       0x10
+
 void cpu_ppc_pmu_init(CPUPPCState *env);
 void pmu_mmcr01a_updated(CPUPPCState *env);
 #else
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index d93fbd4574..7824475f54 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -177,6 +177,7 @@ struct DisasContext {
 #if defined(TARGET_PPC64)
     bool sf_mode;
     bool has_cfar;
+    bool has_bhrb;
 #endif
     bool fpu_enabled;
     bool altivec_enabled;
@@ -4104,12 +4105,100 @@ static void gen_rvwinkle(DisasContext *ctx)
 }
 #endif /* #if defined(TARGET_PPC64) */

-static inline void gen_update_cfar(DisasContext *ctx, target_ulong nip)
+
+static inline void gen_update_branch_history(DisasContext *ctx,
+                                             target_ulong nip,
+                                             TCGv target,
+                                             target_long inst_type)
 {
 #if defined(TARGET_PPC64)
+    TCGv t0;
+    TCGv t1;
+    TCGv t2;
+    TCGv t3;
+    TCGLabel *no_update;
+
     if (ctx->has_cfar) {
         tcg_gen_movi_tl(cpu_cfar, nip);
     }
+
+    if (!ctx->has_bhrb || inst_type == BHRB_TYPE_NORECORD) {
+        return;
+    }
+
+    /* ISA 3.1 adds the PMCRA[BRHBRD] and problem state checks */
+ if ((ctx->insns_flags2 & PPC2_ISA310) && (ctx->mmcra_bhrbrd || !ctx->pr)) {
+        return;
+    }
+
+    /* Check for BHRB "frozen" conditions */
+    if (ctx->mmcr0_fcpc) {
+        if (ctx->mmcr0_fcp) {
+            if ((ctx->hv) && (ctx->pr)) {
+                return;
+            }
+        } else if (!(ctx->hv) && (ctx->pr)) {
+            return;
+        }
+    } else if ((ctx->mmcr0_fcp) && (ctx->pr)) {
+        return;
+    }
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+    t2 = tcg_temp_new();
+    t3 = tcg_temp_new();
+    no_update = gen_new_label();
+
+    /* check for bhrb filtering */
+    tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_filter));
+    tcg_gen_andi_tl(t0, t0, inst_type);
+    tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, no_update);
+
+    /* load bhrb base address into t2 */
+    tcg_gen_ld_tl(t2, cpu_env, offsetof(CPUPPCState, bhrb_base));
+
+    /* load current bhrb_offset into t0 */
+    tcg_gen_ld_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_offset));
+
+    /* load a BHRB offset mask into t3 */
+ tcg_gen_ld_tl(t3, cpu_env, offsetof(CPUPPCState, bhrb_offset_mask));
+
+    /* add t2 to t0 to get address of bhrb entry */
+    tcg_gen_add_tl(t1, t0, t2);
+
+    /* store nip into bhrb at bhrb_offset */
+    tcg_gen_st_i64(tcg_constant_i64(nip), (TCGv_ptr)t1, 0);
+
+    /* add 8 to current bhrb_offset */
+    tcg_gen_addi_tl(t0, t0, 8);
+
+    /* apply offset mask */
+    tcg_gen_and_tl(t0, t0, t3);

For some cases where the value remains live for a time and not reused,
maybe use a more descriptive name? I know existing code is pretty lazy
with names (and I share some blame). t0,t2,t3 could be base, offset,
mask respectively, and t1 could be tmp that is used for filter and
address generation. Just a suggestion?


Yeah, I agree.  That is normally what I would have done, but changed
my mind after seeing how everyone else seemed to be doing it :)

Changing for readability.


+
+    if (inst_type & BHRB_TYPE_XL_FORM) {
+        /* Also record the target address for XL-Form branches */
+
+        /* add t2 to t0 to get address of bhrb entry */
+        tcg_gen_add_tl(t1, t0, t2);
+
+        /* Set the 'T' bit for target entries */
+        tcg_gen_ori_tl(t2, target, 0x2);
+
+        /* Store target address in bhrb */
+        tcg_gen_st_tl(t2, (TCGv_ptr)t1, 0);
+
+        /* add 8 to current bhrb_offset */
+        tcg_gen_addi_tl(t0, t0, 8);
+
+        /* apply offset mask */
+        tcg_gen_and_tl(t0, t0, t3);
+    }

I would say this is bordering on warranting a new function since it
mostly duplicates previous block. You could have it take base, mask,
offset and value to store, and have it return new offset.


I agree.  Will make the changes.

+
+    /* save updated bhrb_offset for next time */
+    tcg_gen_st_tl(t0, cpu_env, offsetof(CPUPPCState, bhrb_offset));
+
+    gen_set_label(no_update);
 #endif
 }

It all looks pretty good otherwise. I do worry about POWER8/9 which
do not have BHRB disable bit. How much do they slow down I wonder?

That is a good question!  I'll see if I can get some linux boot times
with and without the changes on P9.


Thanks,
Nick

Thanks for taking the time to review my code!

Glenn



reply via email to

[Prev in Thread] Current Thread [Next in Thread]