[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dy
From: |
Alex Bennée |
Subject: |
Re: [RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dynamic_cast_assert (HACK!!!) |
Date: |
Thu, 04 Aug 2022 17:51:27 +0100 |
User-agent: |
mu4e 1.7.27; emacs 28.1.91 |
Cédric Le Goater <clg@kaod.org> writes:
> Hello Alex,
>
> Thanks for putting some time into this problem,
>
> On 8/4/22 11:20, Alex Bennée wrote:
>> Investigating why some BMC models are so slow compared to a plain ARM
>> virt machines I did some profiling of:
>> ./qemu-system-arm -M romulus-bmc -nic user \
>> -drive
>> file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \
>> -nographic -serial mon:stdio
>> And saw that object_dynamic_cast was dominating the profile times.
>> We
>> have a number of cases in the CPU hot path and more importantly for
>> this model in the SSI bus. As the class is static once the object is
>> created we just cache it and use it instead of the dynamic case
>> macros.
>> [AJB: I suspect a proper fix for this is for QOM to support a cached
>> class lookup, abortive macro attempt #if 0'd in this patch].
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: Cédric Le Goater <clg@kaod.org>
>
>
> Here are some results,
>
> * romulus-bmc, OpenBmc login prompt
>
> without : 82s
> with : 56s
Looks like I lucked out picking the lowest hanging fruit.
>
> * ast2500-evb,execute-in-place=true, U-boot 2019.04 prompt
>
> without : 30s
> with : 22s
>
> * witherspoon-bmc,execute-in-place=true, U-boot 2016.07 prompt
>
> without : 5.5s
> with : 4.1s
>
> There is definitely an improvement in all scenarios.
>
> Applying a similar technique on AspeedSMCClass, I could gain
> another ~10% and boot the ast2500-evb,execute-in-place=true
> machine, up to the U-boot 2019.04 prompt, in less then 20s.
There are some fundamentals to XIP which means they will be slower if
each instruction is being sucked through io_readx/device emulation
although I'm not sure what the exact mechanism is because surely a ROM
can just be mapped into the address space and run from there?
> However, newer u-boot are still quite slow to boot when executing
> from the flash device.
For any of those machines? Whats the next command line for me to dig
into?
>
> Thanks,
>
> C.
>
>> ---
>> include/hw/core/cpu.h | 2 ++
>> include/hw/ssi/ssi.h | 3 +++
>> include/qom/object.h | 29 +++++++++++++++++++++++++++++
>> accel/tcg/cputlb.c | 12 ++++++++----
>> hw/ssi/ssi.c | 10 +++++++---
>> 5 files changed, 49 insertions(+), 7 deletions(-)
>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>> index 500503da13..70027a772e 100644
>> --- a/include/hw/core/cpu.h
>> +++ b/include/hw/core/cpu.h
>> @@ -317,6 +317,8 @@ struct qemu_work_item;
>> struct CPUState {
>> /*< private >*/
>> DeviceState parent_obj;
>> + /* cache to avoid expensive CPU_GET_CLASS */
>> + CPUClass *cc;
>> /*< public >*/
>> int nr_cores;
>> diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h
>> index f411858ab0..6950f86810 100644
>> --- a/include/hw/ssi/ssi.h
>> +++ b/include/hw/ssi/ssi.h
>> @@ -59,6 +59,9 @@ struct SSIPeripheralClass {
>> struct SSIPeripheral {
>> DeviceState parent_obj;
>> + /* cache the class */
>> + SSIPeripheralClass *spc;
>> +
>> /* Chip select state */
>> bool cs;
>> };
>> diff --git a/include/qom/object.h b/include/qom/object.h
>> index ef7258a5e1..2202dbfa43 100644
>> --- a/include/qom/object.h
>> +++ b/include/qom/object.h
>> @@ -198,6 +198,35 @@ struct Object
>> OBJ_NAME##_CLASS(const void *klass) \
>> { return OBJECT_CLASS_CHECK(ClassType, klass, TYPENAME); }
>> +#if 0
>> +/**
>> + * DECLARE_CACHED_CLASS_CHECKER:
>> + * @InstanceType: instance struct name
>> + * @ClassType: class struct name
>> + * @OBJ_NAME: the object name in uppercase with underscore separators
>> + * @TYPENAME: type name
>> + *
>> + * This variant of DECLARE_CLASS_CHECKERS allows for the caching of
>> + * class in the parent object instance. This is useful for very hot
>> + * path code at the expense of an extra indirection and check. As per
>> + * the original direct usage of this macro should be avoided if the
>> + * complete OBJECT_DECLARE_TYPE macro has been used.
>> + *
>> + * This macro will provide the class type cast functions for a
>> + * QOM type.
>> + */
>> +#define DECLARE_CACHED_CLASS_CHECKERS(InstanceType, ClassType, OBJ_NAME,
>> TYPENAME) \
>> + DECLARE_CLASS_CHECKERS(ClassType, OBJ_NAME, TYPENAME) \
>> + static inline G_GNUC_UNUSED ClassType * \
>> + OBJ_NAME##_GET_CACHED_CLASS(const void *obj) \
>> + { \
>> + InstanceType *p = (InstanceType *) obj; \
>> + p->cc = p->cc ? p->cc : OBJECT_GET_CLASS(ClassType, obj, TYPENAME);\
>> + return p->cc; \
>> + }
>> +
>> +#endif
>> +
>> /**
>> * DECLARE_OBJ_CHECKERS:
>> * @InstanceType: instance struct name
>> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
>> index a46f3a654d..882315f7dd 100644
>> --- a/accel/tcg/cputlb.c
>> +++ b/accel/tcg/cputlb.c
>> @@ -1303,8 +1303,9 @@ static inline ram_addr_t
>> qemu_ram_addr_from_host_nofail(void *ptr)
>> static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
>> MMUAccessType access_type, int mmu_idx, uintptr_t
>> retaddr)
>> {
>> - CPUClass *cc = CPU_GET_CLASS(cpu);
>> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
>> bool ok;
>> + cpu->cc = cc;
>> /*
>> * This is not a probe, so only valid return is success; failure
>> @@ -1319,7 +1320,8 @@ static inline void cpu_unaligned_access(CPUState *cpu,
>> vaddr addr,
>> MMUAccessType access_type,
>> int mmu_idx, uintptr_t retaddr)
>> {
>> - CPUClass *cc = CPU_GET_CLASS(cpu);
>> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
>> + cpu->cc = cc;
>> cc->tcg_ops->do_unaligned_access(cpu, addr, access_type,
>> mmu_idx, retaddr);
>> }
>> @@ -1331,7 +1333,8 @@ static inline void cpu_transaction_failed(CPUState
>> *cpu, hwaddr physaddr,
>> MemTxResult response,
>> uintptr_t retaddr)
>> {
>> - CPUClass *cc = CPU_GET_CLASS(cpu);
>> + CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
>> + cpu->cc = cc;
>> if (!cpu->ignore_memory_transaction_failures &&
>> cc->tcg_ops->do_transaction_failed) {
>> @@ -1606,7 +1609,8 @@ static int probe_access_internal(CPUArchState *env,
>> target_ulong addr,
>> if (!tlb_hit_page(tlb_addr, page_addr)) {
>> if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) {
>> CPUState *cs = env_cpu(env);
>> - CPUClass *cc = CPU_GET_CLASS(cs);
>> + CPUClass *cc = cs->cc ? cs->cc : CPU_GET_CLASS(cs);
>> + cs->cc = cc;
>> if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size,
>> access_type,
>> mmu_idx, nonfault, retaddr)) {
>> diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c
>> index 003931fb50..f749feb6e3 100644
>> --- a/hw/ssi/ssi.c
>> +++ b/hw/ssi/ssi.c
>> @@ -38,7 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level)
>> bool cs = !!level;
>> assert(n == 0);
>> if (s->cs != cs) {
>> - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s);
>> + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); */
>> + SSIPeripheralClass *ssc = s->spc;
>> if (ssc->set_cs) {
>> ssc->set_cs(s, cs);
>> }
>> @@ -48,7 +49,8 @@ static void ssi_cs_default(void *opaque, int n, int level)
>> static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev,
>> uint32_t val)
>> {
>> - SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev);
>> + /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); */
>> + SSIPeripheralClass *ssc = dev->spc;
>> if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) ||
>> (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) ||
>> @@ -67,6 +69,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error
>> **errp)
>> ssc->cs_polarity != SSI_CS_NONE) {
>> qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1);
>> }
>> + s->spc = ssc;
>> ssc->realize(s, errp);
>> }
>> @@ -120,7 +123,8 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val)
>> QTAILQ_FOREACH(kid, &b->children, sibling) {
>> SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child);
>> - ssc = SSI_PERIPHERAL_GET_CLASS(peripheral);
>> + /* ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); */
>> + ssc = peripheral->spc;
>> r |= ssc->transfer_raw(peripheral, val);
>> }
>>
--
Alex Bennée