[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 09/18] i386: Destructive vector helpers for AVX
From: Richard Henderson
Subject: Re: [PATCH 09/18] i386: Destructive vector helpers for AVX
Date: Thu, 25 Aug 2022 17:41:17 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.11.0
On 8/25/22 15:14, Paolo Bonzini wrote:
void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
{
- Reg r;
+ uint32_t r0, r1, r2, r3;
- r.L(0) = s->L(order & 3);
- r.L(1) = s->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- MOVE(*d, r);
+ SHUFFLE4(L, s, s, 0);
+#if SHIFT == 2
+ SHUFFLE4(L, s, s, 4);
+#endif
}
Why the #if and not a loop?
for (int offset = 0; offset < 2 << SHIFT; offset += 4) {
SHUFFLE4(L, s, s, offset);
}
Most of the other shuffles haven't been updated for ymm.
Perhaps this too should be deferred to the future patch?
+#if SHIFT == 0
+#define PACK_WIDTH 4
+#else
+#define PACK_WIDTH 8
+#endif
....
+#define PACK4(F, to, reg, from) do { \
+ r[to + 0] = F((int16_t)reg->W(from + 0)); \
+ r[to + 1] = F((int16_t)reg->W(from + 1)); \
+ r[to + 2] = F((int16_t)reg->W(from + 2)); \
+ r[to + 3] = F((int16_t)reg->W(from + 3)); \
+ } while (0)
...
+
+#define PACK_HELPER_B(name, F) \
+void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
+ Reg *d, Reg *s) \
+{ \
+ Reg *v = d; \
+ uint8_t r[PACK_WIDTH * 2]; \
+ int i; \
+ PACK4(F, 0, v, 0); \
+ PACK4(F, PACK_WIDTH, s, 0); \
+ XMM_ONLY( \
+ PACK4(F, 4, v, 4); \
+ PACK4(F, 12, s, 4); \
+ ) \
The 4 in PACK4 looks suspiciously like it should be PACK_WIDTH --
that would eliminate the XMM_ONLY bit here.
void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
+ Reg *v = d;
+ uint16_t r[PACK_WIDTH];
+ int i, j, k;
+ for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
+ r[0] = satsw(v->L(j));
+ r[1] = satsw(v->L(j + 1));
+ r[PACK_WIDTH / 2 + 0] = satsw(s->L(j));
+ r[PACK_WIDTH / 2 + 1] = satsw(s->L(j + 1));
+#if SHIFT >= 1
+ r[2] = satsw(v->L(j + 2));
+ r[3] = satsw(v->L(j + 3));
+ r[6] = satsw(s->L(j + 2));
+ r[7] = satsw(s->L(j + 3));
#endif
+ for (k = 0; k < PACK_WIDTH; k++) {
+ d->W(i + k) = r[k];
+ }
+ }
Hmm. Better with nested loops?
for (j = 0; j <= 1 << SHIFT; j += 4) {
for (k = 0; k < PACK_WIDTH / 2; ++k) {
r[k] = satsw(v->L(j + k));
}
for (k = 0; k < PACK_WIDTH / 2; ++k) {
r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
}
for (k = 0; k < PACK_WIDTH; k++) {
d->W(j * 2 + k) = r[k];
}
}
#define UNPCK_OP(base_name, base) \
\
void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
- Reg *d, Reg *s) \
+ Reg *d, Reg *s) \
{ \
- Reg r; \
+ Reg *v = d; \
+ uint8_t r[PACK_WIDTH * 2]; \
+ int i; \
\
- r.B(0) = d->B((base << (SHIFT + 2)) + 0); \
- r.B(1) = s->B((base << (SHIFT + 2)) + 0); \
- r.B(2) = d->B((base << (SHIFT + 2)) + 1); \
- r.B(3) = s->B((base << (SHIFT + 2)) + 1); \
- r.B(4) = d->B((base << (SHIFT + 2)) + 2); \
- r.B(5) = s->B((base << (SHIFT + 2)) + 2); \
- r.B(6) = d->B((base << (SHIFT + 2)) + 3); \
- r.B(7) = s->B((base << (SHIFT + 2)) + 3); \
+ r[0] = v->B((base * PACK_WIDTH) + 0); \
+ r[1] = s->B((base * PACK_WIDTH) + 0); \
+ r[2] = v->B((base * PACK_WIDTH) + 1); \
+ r[3] = s->B((base * PACK_WIDTH) + 1); \
+ r[4] = v->B((base * PACK_WIDTH) + 2); \
+ r[5] = s->B((base * PACK_WIDTH) + 2); \
+ r[6] = v->B((base * PACK_WIDTH) + 3); \
+ r[7] = s->B((base * PACK_WIDTH) + 3); \
XMM_ONLY( \
- r.B(8) = d->B((base << (SHIFT + 2)) + 4); \
- r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
- r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
- r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
- r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
- r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
- r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
- r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
+ r[8] = v->B((base * PACK_WIDTH) + 4); \
+ r[9] = s->B((base * PACK_WIDTH) + 4); \
+ r[10] = v->B((base * PACK_WIDTH) + 5); \
+ r[11] = s->B((base * PACK_WIDTH) + 5); \
+ r[12] = v->B((base * PACK_WIDTH) + 6); \
+ r[13] = s->B((base * PACK_WIDTH) + 6); \
+ r[14] = v->B((base * PACK_WIDTH) + 7); \
+ r[15] = s->B((base * PACK_WIDTH) + 7); \
) \
- MOVE(*d, r); \
+ for (i = 0; i < PACK_WIDTH * 2; i++) { \
+ d->B(i) = r[i]; \
+ } \
} \
Surely this is better with loops, and there are more such semi-conversions following.
r~
- Re: [PATCH 03/18] i386: Add CHECK_NO_VEX, (continued)
- [PATCH 06/18] i386: Rewrite vector shift helper, Paolo Bonzini, 2022/08/25
- [PATCH 07/18] i386: Rewrite simple integer vector helpers, Paolo Bonzini, 2022/08/25
- [PATCH 08/18] i386: Misc integer AVX helper prep, Paolo Bonzini, 2022/08/25
- [PATCH 05/18] i386: Add ZMM_OFFSET macro, Paolo Bonzini, 2022/08/25
- [PATCH 09/18] i386: Destructive vector helpers for AVX, Paolo Bonzini, 2022/08/25
- Re: [PATCH 09/18] i386: Destructive vector helpers for AVX,
Richard Henderson <=
- [PATCH 10/18] i386: Add size suffix to vector FP helpers, Paolo Bonzini, 2022/08/25
- [PATCH 11/18] i386: Floating point arithmetic helper AVX prep, Paolo Bonzini, 2022/08/25
- [PATCH 14/18] i386: Destructive FP helpers for AVX, Paolo Bonzini, 2022/08/25
- [PATCH 12/18] i386: reimplement AVX comparison helpers, Paolo Bonzini, 2022/08/25
- [PATCH 15/18] i386: Misc AVX helper prep, Paolo Bonzini, 2022/08/25