lightning
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] ppc: Add optimization to generate endian-swapping load opcod


From: Paul Cercueil
Subject: Re: [PATCH] ppc: Add optimization to generate endian-swapping load opcodes
Date: Thu, 11 Aug 2022 20:41:20 +0200



Le jeu., août 11 2022 at 14:40:57 -0300, Paulo César Pereira de Andrade <paulo.cesar.pereira.de.andrade@gmail.com> a écrit :
Em qui., 11 de ago. de 2022 às 14:18, Paul Cercueil
<paul@crapouillou.net> escreveu:

 Hi Paulo,

 Le jeu., août 11 2022 at 13:59:47 -0300, Paulo César Pereira de
 Andrade <paulo.cesar.pereira.de.andrade@gmail.com> a écrit :
 > Em qua., 10 de ago. de 2022 às 13:28, Paul Cercueil
 > <paul@crapouillou.net> escreveu:
 >>
 >>  When generating endian-swap code with jit_bswapr_us() or
>> jit_bswapr_ui(), look at the last opcode generated. If it's a LHZX
 >> or
>> LWZX, it can be replaced with a LHBRX or LWBRX opcode, which will
 >>  perform the endian swap directly.
 >>
 >>  If it's a LHZ or LWZ, then we can just move the immediate value
 >> into a
 >>  temporary register, and emit a LHBRX or LWBRX opcode.
 >>
>> The "no flag" check is needed, because we must ensure that no branch >> will jump between the load opcode that we want to modify, and the
 >>  jit_bswapr opcode.
 >
 >   Not certain if I understand the need for the no_flag. It only
 > backtracks one
> instruction, and that cannot be a jump for the pattern used. Please
 > confirm it
 > is for the case:
 >
 > LABEL:
 >   ldr_ui %r0 %r1
 >   bswapr_ui %r0 %r0
 >
 > where the ldr_ui might be a jump target.

Actually that case would be fine. It's for the case where the LABEL is between the ldr_ui and the bswapr_ui. I do check the previous opcode,
 because that's where the jit_flag_patch seems to be.

I see. The logic is just checking if the previous node is a label or an
epilog implicit patch.

 >   Either way, basically it is implementing a load&byte_swap in a
 > single
 > instruction.

 Correct.

 >>  Signed-off-by: Paul Cercueil <paul@crapouillou.net>
 >>  ---
 >>   lib/jit_ppc-cpu.c | 63
 >> +++++++++++++++++++++++++++++++++++++++++------
 >>   lib/jit_ppc.c     | 11 +++++++--
 >>   2 files changed, 64 insertions(+), 10 deletions(-)
 >>
 >>  diff --git a/lib/jit_ppc-cpu.c b/lib/jit_ppc-cpu.c
 >>  index cab085f..3769a61 100644
 >>  --- a/lib/jit_ppc-cpu.c
 >>  +++ b/lib/jit_ppc-cpu.c
 >>  @@ -260,7 +260,7 @@ static void
 >> _FXS(jit_state_t*,int,int,int,int,int,int,int);
 >>   #  define LHAU(d,a,s)                  FDs(43,d,a,s)
 >>   #  define LHAUX(d,a,b)                 FX(31,d,a,b,375)
 >>   #  define LHAX(d,a,b)                  FX(31,d,a,b,343)
 >>  -#  define LHRBX(d,a,b)                 FX(31,d,a,b,790)
 >>  +#  define LHBRX(d,a,b)                 FX(31,d,a,b,790)
 >>   #  define LHZ(d,a,s)                   FDs(40,d,a,s)
 >>   #  define LHZU(d,a,s)                  FDs(41,d,a,s)
 >>   #  define LHZUX(d,a,b)                 FX(31,d,a,b,311)
 >>  @@ -521,10 +521,12 @@ static jit_word_t
 >> _movi_p(jit_state_t*,jit_int32_t,jit_word_t);
 >>   #    define extr_i(r0,r1)              EXTSW(r0,r1)
 >>   #    define extr_ui(r0,r1)             CLRLDI(r0,r1,32)
 >>   #  endif
 >>  -#  define bswapr_us(r0,r1)             _bswapr_us(_jit,r0,r1)
 >>  -static void _bswapr_us(jit_state_t*,jit_int32_t,jit_int32_t);
 >>  -#  define bswapr_ui(r0,r1)             _bswapr_ui(_jit,r0,r1)
 >>  -static void _bswapr_ui(jit_state_t*,jit_int32_t,jit_int32_t);
 >>  +#  define bswapr_us_lh(r0,r1,no_flag)
 >> _bswapr_us(_jit,r0,r1,no_flag)
 >>  +#  define bswapr_us(r0,r1)             _bswapr_us(_jit,r0,r1,0)
 >>  +static void
 >> _bswapr_us(jit_state_t*,jit_int32_t,jit_int32_t,jit_bool_t);
 >>  +#  define bswapr_ui_lw(r0,r1,no_flag)
 >> _bswapr_ui(_jit,r0,r1,no_flag)
 >>  +#  define bswapr_ui(r0,r1)             _bswapr_ui(_jit,r0,r1,0)
 >>  +static void
 >> _bswapr_ui(jit_state_t*,jit_int32_t,jit_int32_t,jit_bool_t);
 >>   #  if __WORDSIZE == 64
 >>   #    define bswapr_ul(r0,r1)
 >> generic_bswapr_ul(_jit,r0,r1)
 >>   #  endif
>> @@ -1148,8 +1150,31 @@ _movi_p(jit_state_t *_jit, jit_int32_t r0,
 >> jit_word_t i0)
 >>   }
 >>
 >>   static void
 >>  -_bswapr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 >>  +_bswapr_us(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
 >> jit_bool_t no_flag)
 >>   {
 >>  +    jit_int32_t                reg, addr_reg;
 >>  +
 >>  +    if (no_flag && r0 == r1) {
>> + if ((*(_jit->pc.ui - 1) & 0xffe007ff) == (0x7c00022e | r0
 >> << 21)) {
 >>  +            /* Convert LHZX to LHBRX */
 >>  +            _jit->pc.ui--;
>> + LHBRX(r0, (*_jit->pc.ui >> 16) & 0x1f, (*_jit->pc.ui
 >> >> 11) & 0x1f);
 >>  +            return;
 >>  +        }
 >>  +
>> + if ((*(_jit->pc.ui - 1) & 0xffe00000) == (0xa0000000 | r0
 >> << 21)) {
 >>  +            /* Convert LHZ to LHBRX */
 >>  +            _jit->pc.ui--;
 >>  +            addr_reg = (*_jit->pc.ui >> 16) & 0x1f;
 >>  +
 >>  +            reg = jit_get_reg(jit_class_gpr);
 >>  +            LI(rn(reg), (short)*_jit->pc.ui);
 >>  +            LHBRX(r0, rn(reg), addr_reg);
 >>  +            jit_unget_reg(reg);
 >>  +            return;
 >>  +        }
 >>  +    }
 >>  +
 >>       if (r0 == r1) {
 >>           RLWIMI(r0, r0, 16, 8, 15);
 >>           RLWINM(r0, r0, 24, 16, 31);
 >>  @@ -1160,9 +1185,31 @@ _bswapr_us(jit_state_t *_jit, jit_int32_t
 >> r0, jit_int32_t r1)
 >>   }
 >>
 >>   static void
 >>  -_bswapr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1)
 >>  +_bswapr_ui(jit_state_t *_jit, jit_int32_t r0, jit_int32_t r1,
 >> jit_bool_t no_flag)
 >>   {
 >>  -    jit_int32_t                reg;
 >>  +    jit_int32_t                reg, addr_reg;
 >>  +
 >>  +    if (no_flag && r0 == r1) {
>> + if ((*(_jit->pc.ui - 1) & 0xffe007ff) == (0x7c00002e | r0
 >> << 21)) {
 >>  +            /* Convert LWZX to LWBRX */
 >>  +            _jit->pc.ui--;
>> + LWBRX(r0, (*_jit->pc.ui >> 16) & 0x1f, (*_jit->pc.ui
 >> >> 11) & 0x1f);
 >>  +            return;
 >>  +        }
 >>  +
>> + if ((*(_jit->pc.ui - 1) & 0xffe00000) == (0x80000000 | r0
 >> << 21)) {
 >>  +            /* Convert LWZ to LWBRX */
 >>  +            _jit->pc.ui--;
 >>  +            addr_reg = (*_jit->pc.ui >> 16) & 0x1f;
 >>  +
 >>  +            reg = jit_get_reg(jit_class_gpr);
 >>  +            LI(rn(reg), (short)*_jit->pc.ui);
 >>  +            LWBRX(r0, rn(reg), addr_reg);
 >>  +            jit_unget_reg(reg);
 >>  +            return;
 >>  +        }
 >>  +    }
 >>  +
 >>       reg = jit_get_reg(jit_class_gpr);
 >>       ROTLWI(rn(reg), r1, 8);
 >>       RLWIMI(rn(reg), r1, 24, 0, 7);
 >
> These could be simplified a bit, or, at least have a single visible
 > exit
 > point of the function.

 I couldn't find a way to only use if/else blocks without having the
 default path duplicated...

 >>  diff --git a/lib/jit_ppc.c b/lib/jit_ppc.c
 >>  index e94d1a5..28975be 100644
 >>  --- a/lib/jit_ppc.c
 >>  +++ b/lib/jit_ppc.c
 >>  @@ -1148,6 +1148,7 @@ _emit_code(jit_state_t *_jit)
 >>       jit_word_t          word;
 >>       jit_int32_t                 value;
 >>       jit_int32_t                 offset;
 >>  +    jit_bool_t       no_flag = 0;
 >>       struct {
 >>          jit_node_t      *node;
 >>          jit_word_t       word;
 >>  @@ -1356,8 +1357,12 @@ _emit_code(jit_state_t *_jit)
 >>   #  if __WORDSIZE == 64
 >>                  case_rr(hton, _ul);
 >>   #  endif
 >>  -               case_rr(bswap, _us);
 >>  -               case_rr(bswap, _ui);
 >>  +           case jit_code_bswapr_us:
>> + bswapr_us_lh(rn(node->u.w), rn(node->v.w), no_flag);
 >>  +               break;
 >>  +           case jit_code_bswapr_ui:
>> + bswapr_ui_lw(rn(node->u.w), rn(node->v.w), no_flag);
 >>  +               break;
 >>   #  if __WORDSIZE == 64
 >>                  case_rr(bswap, _ul);
 >>   #  endif
 >>  @@ -1823,6 +1828,8 @@ _emit_code(jit_state_t *_jit)
 >>          assert(_jitc->regarg == 0 && _jitc->synth == 0);
 >>          /* update register live state */
 >>          jit_reglive(node);
 >>  +
 >>  +        no_flag = !(node->flag & jit_flag_patch);
 >>       }
 >>   #undef case_brf
 >>   #undef case_brw
 >>  --
 >>  2.35.1
 >
> Please only clarify about the no_flag and the patch can be applied,
 > possibly
 > slightly modified based on comments below.
 >
 >   Should also add some comments about what it is doing.

You mean for no_flag? Because I did write the comment "Convert LWZ to
 LWBRX" :)

  Actually, more likely a comment about doing it in a backend specific
optimization pass, to cover cases like this.

One last question is, will it work as expected in big endian and little
endian? Also, will it work in 32 and 64 bit? Before doing a release I
would check in actual hardware or virtual machines all backends and
validate it, but better to not let things like this to be verified very later.

I tested on big-endian but I don't see a reason why it wouldn't work on little-endian as well - the LHBRX and LWBRX opcode byte-swap the result value independently of the CPU endianness.

I tested on PPC32, but 64-bit should work as well; I will check on little-endian PPC64 to be sure.

Cheers,
-Paul

> If some new kind of optimization pass is done at some point, adding
 > backend
> specific codes, this could be extended to work on more complex cases,
 > where
> a temporary was already used to do the previous instruction, and it
 > was required
> a spill/reload, if not required a spill, the temporary will work the
 > same way when
 > replacing the instruction.

Honestly I'm fine with the current handling. Trying to do better might
 end making it too complex.

Similarly for my delay-slot-filling patch on MIPS, I only swap the jump
 opcode with the previous instruction if possible, I don't backtrack
 further to find an instruction that can be used.

 Cheers,
 -Paul







reply via email to

[Prev in Thread] Current Thread [Next in Thread]