[PATCH 20/35] target/i386: reimplement 0x0f 0x78-0x7f, add AVX

qemu-devel

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 20/35] target/i386: reimplement 0x0f 0x78-0x7f, add AVX

From:	Paolo Bonzini
Subject:	[PATCH 20/35] target/i386: reimplement 0x0f 0x78-0x7f, add AVX
Date:	Thu, 13 Oct 2022 23:46:36 +0200

These are a mixed batch, including the first two horizontal
(66 and F2 only) operations, more moves, and SSE4a extract/insert.

Because SSE4a is pretty rare, I chose to leave the helper as they are,
but it is possible to unify them by loading index and length from the
source XMM register and generating deposit or extract TCG ops.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/tcg/decode-new.c.inc | 51 +++++++++++++++++++
 target/i386/tcg/emit.c.inc       | 86 ++++++++++++++++++++++++++++++++
 target/i386/tcg/translate.c      |  1 +
 3 files changed, 138 insertions(+)

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index abb99bfe51..df38c12a2d 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -168,6 +168,50 @@ static void decode_0F6F(DisasContext *s, CPUX86State *env, 
X86OpEntry *entry, ui
     *entry = *decode_by_prefix(s, opcodes_0F6F);
 }
 
+static void decode_0F78(DisasContext *s, CPUX86State *env, X86OpEntry *entry, 
uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F78[4] = {
+        {},
+        X86_OP_ENTRY3(EXTRQ_i,       V,x, None,None, I,w,  cpuid(SSE4A)),
+        {},
+        X86_OP_ENTRY3(INSERTQ_i,     V,x, U,x, I,w,        cpuid(SSE4A)),
+    };
+    *entry = *decode_by_prefix(s, opcodes_0F78);
+}
+
+static void decode_0F79(DisasContext *s, CPUX86State *env, X86OpEntry *entry, 
uint8_t *b)
+{
+    if (s->prefix & PREFIX_REPNZ) {
+        entry->gen = gen_INSERTQ_r;
+    } else if (s->prefix & PREFIX_DATA) {
+        entry->gen = gen_EXTRQ_r;
+    } else {
+        entry->gen = NULL;
+    };
+}
+
+static void decode_0F7E(DisasContext *s, CPUX86State *env, X86OpEntry *entry, 
uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F7E[4] = {
+        X86_OP_ENTRY3(MOVD_from,  E,y, None,None, P,y, vex5 mmx),
+        X86_OP_ENTRY3(MOVD_from,  E,y, None,None, V,y, vex5),
+        X86_OP_ENTRY3(MOVQ,       V,x, None,None, W,q, vex5),  /* wrong dest 
Vy on SDM! */
+        {},
+    };
+    *entry = *decode_by_prefix(s, opcodes_0F7E);
+}
+
+static void decode_0F7F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, 
uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F7F[4] = {
+        X86_OP_ENTRY3(MOVDQ,       W,x, None,None, V,x, vex1 mmx), /* movq */
+        X86_OP_ENTRY3(MOVDQ,       W,x, None,None, V,x, vex1), /* movdqa */
+        X86_OP_ENTRY3(MOVDQ,       W,x, None,None, V,x, vex4_unal), /* movdqu 
*/
+        {},
+    };
+    *entry = *decode_by_prefix(s, opcodes_0F7F);
+}
+
 static const X86OpEntry opcodes_0F38_00toEF[240] = {
 };
 
@@ -317,6 +361,13 @@ static const X86OpEntry opcodes_0F[256] = {
     [0x6e] = X86_OP_ENTRY3(MOVD_to,    V,x, None,None, E,y, vex5 mmx p_00_66), 
 /* wrong dest Vy on SDM! */
     [0x6f] = X86_OP_GROUP0(0F6F),
 
+    [0x78] = X86_OP_GROUP0(0F78),
+    [0x79] = X86_OP_GROUP2(0F79,       V,x, U,x,       cpuid(SSE4A)),
+    [0x7c] = X86_OP_ENTRY3(VHADD,      V,x, H,x, W,x,  vex2 cpuid(SSE3) 
p_66_f2),
+    [0x7d] = X86_OP_ENTRY3(VHSUB,      V,x, H,x, W,x,  vex2 cpuid(SSE3) 
p_66_f2),
+    [0x7e] = X86_OP_GROUP0(0F7E),
+    [0x7f] = X86_OP_GROUP0(0F7F),
+
     /* Incorrectly missing from 2-17 */
     [0xd8] = X86_OP_ENTRY3(PSUBUSB,  V,x, H,x, W,x,  vex4 mmx avx2_256 
p_00_66),
     [0xd9] = X86_OP_ENTRY3(PSUBUSW,  V,x, H,x, W,x,  vex4 mmx avx2_256 
p_00_66),
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index e063d2fe04..7e13fb5869 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -443,6 +443,30 @@ static void gen_##uname(DisasContext *s, CPUX86State *env, 
X86DecodedInsn *decod
 UNARY_FP32_SSE(VRSQRT, rsqrt)
 UNARY_FP32_SSE(VRCP, rcp)
 
+/*
+ * 66 = v*pd Vpd, Hpd, Wpd
+ * f2 = v*ps Vps, Hps, Wps
+ */
+static inline void gen_horizontal_fp_sse(DisasContext *s, CPUX86State *env, 
X86DecodedInsn *decode,
+                                         SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp 
ps_xmm,
+                                         SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp 
ps_ymm)
+{
+    SSEFunc_0_eppp ps, pd, fn;
+    ps = s->vex_l ? ps_ymm : ps_xmm;
+    pd = s->vex_l ? pd_ymm : pd_xmm;
+    fn = s->prefix & PREFIX_DATA ? pd : ps;
+    fn(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+}
+#define HORIZONTAL_FP_SSE(uname, lname)                                        
    \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode) \
+{                                                                              
    \
+    gen_horizontal_fp_sse(s, env, decode,                                      
    \
+                          gen_helper_##lname##pd_xmm, 
gen_helper_##lname##ps_xmm,  \
+                          gen_helper_##lname##pd_ymm, 
gen_helper_##lname##ps_ymm); \
+}
+HORIZONTAL_FP_SSE(VHADD, hadd)
+HORIZONTAL_FP_SSE(VHSUB, hsub)
+
 #define BINARY_INT_GVEC(uname, func, ...)                                      
    \
 static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode) \
 {                                                                              
    \
@@ -716,6 +740,32 @@ static void gen_CRC32(DisasContext *s, CPUX86State *env, 
X86DecodedInsn *decode)
     gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, tcg_constant_i32(8 << ot));
 }
 
+static void gen_EXTRQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
+{
+    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
+    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
+
+    gen_helper_extrq_i(cpu_env, OP_PTR0, index, length);
+}
+
+static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
+{
+    gen_helper_extrq_r(cpu_env, OP_PTR0, OP_PTR2);
+}
+
+static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
+{
+    TCGv_i32 length = tcg_constant_i32(decode->immediate & 63);
+    TCGv_i32 index = tcg_constant_i32((decode->immediate >> 8) & 63);
+
+    gen_helper_insertq_i(cpu_env, OP_PTR0, OP_PTR1, index, length);
+}
+
+static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
+{
+    gen_helper_insertq_r(cpu_env, OP_PTR0, OP_PTR2);
+}
+
 static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
 {
     MemOp ot = decode->op[0].ot;
@@ -728,6 +778,24 @@ static void gen_MOVBE(DisasContext *s, CPUX86State *env, 
X86DecodedInsn *decode)
     }
 }
 
+static void gen_MOVD_from(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
+{
+    MemOp ot = decode->op[2].ot;
+
+    switch (ot) {
+    case MO_32:
+#ifdef TARGET_X86_64
+        tcg_gen_ld32u_tl(s->T0, cpu_env, decode->op[2].offset);
+        break;
+    case MO_64:
+#endif
+        tcg_gen_ld_tl(s->T0, cpu_env, decode->op[2].offset);
+        break;
+    default:
+        abort();
+    }
+}
+
 static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn 
*decode)
 {
     MemOp ot = decode->op[2].ot;
@@ -765,6 +833,24 @@ static void gen_MOVMSK(DisasContext *s, CPUX86State *env, 
X86DecodedInsn *decode
     tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
 }
 
+static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    int vec_len = vector_len(s, decode);
+    int lo_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);
+
+    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset);
+    /*
+     * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_64) would
+     * seem to work, but it does not on big-endian platforms; the cleared parts
+     * are always at higher addresses, but cross-endian emulation inverts the
+     * byte order so that the cleared parts need to be at *lower* addresses.
+     * Because oprsz is 8, we see this here even for SSE; but more in general,
+     * it disqualifies using oprsz < maxsz to emulate VEX128.
+     */
+    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+    tcg_gen_st_i64(s->tmp1_i64, cpu_env, lo_ofs);
+}
+
 static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 8e382ce0b4..d87c7b3bac 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -4775,6 +4775,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
 #endif
         if (use_new &&
             ((b >= 0x150 && b <= 0x16f) ||
+             (b >= 0x178 && b <= 0x17f) ||
              (b >= 0x1d8 && b <= 0x1ff && (b & 8)))) {
             disas_insn_new(s, cpu, b + 0x100);
             return s->pc;
-- 
2.37.3

[Prev in Thread]

Current Thread

[Next in Thread]

[PATCH 16/35] target/i386: Introduce 256-bit vector helpers, (continued)
- [PATCH 16/35] target/i386: Introduce 256-bit vector helpers, Paolo Bonzini, 2022/10/13
- [PATCH 08/35] target/i386: validate VEX prefixes via the instructions' exception classes, Paolo Bonzini, 2022/10/13
- [PATCH 13/35] target/i386: support operand merging in binary scalar helpers, Paolo Bonzini, 2022/10/13
- [PATCH 19/35] target/i386: reimplement 0x0f 0x50-0x5f, add AVX, Paolo Bonzini, 2022/10/13
- [PATCH 14/35] target/i386: provide 3-operand versions of unary scalar helpers, Paolo Bonzini, 2022/10/13
- [PATCH 28/35] target/i386: reimplement 0x0f 0x10-0x17, add AVX, Paolo Bonzini, 2022/10/13
- [PATCH 35/35] target/i386: remove old SSE decoder, Paolo Bonzini, 2022/10/13
- [PATCH 18/35] target/i386: reimplement 0x0f 0xd8-0xdf, 0xe8-0xef, 0xf8-0xff, add AVX, Paolo Bonzini, 2022/10/13
- [PATCH 17/35] target/i386: reimplement 0x0f 0x60-0x6f, add AVX, Paolo Bonzini, 2022/10/13
- [PATCH 15/35] target/i386: implement additional AVX comparison operators, Paolo Bonzini, 2022/10/13
- [PATCH 20/35] target/i386: reimplement 0x0f 0x78-0x7f, add AVX, Paolo Bonzini <=
- [PATCH 30/35] target/i386: implement XSAVE and XRSTOR of AVX registers, Paolo Bonzini, 2022/10/13
- [PATCH 21/35] target/i386: reimplement 0x0f 0x70-0x77, add AVX, Paolo Bonzini, 2022/10/13
- [PATCH 32/35] target/i386: Enable AVX cpuid bits when using TCG, Paolo Bonzini, 2022/10/13
- [PATCH 11/35] target/i386: Prepare ops_sse_header.h for 256 bit AVX, Paolo Bonzini, 2022/10/13
- [PATCH 12/35] target/i386: extend helpers to support VEX.V 3- and 4- operand encodings, Paolo Bonzini, 2022/10/13
- [PATCH 24/35] target/i386: reimplement 0x0f 0x3a, add AVX, Paolo Bonzini, 2022/10/13
- [PATCH 31/35] target/i386: implement VLDMXCSR/VSTMXCSR, Paolo Bonzini, 2022/10/13
- [PATCH 23/35] target/i386: clarify (un)signedness of immediates from 0F3Ah opcodes, Paolo Bonzini, 2022/10/13
- [PATCH 27/35] target/i386: reimplement 0x0f 0xc2, 0xc4-0xc6, add AVX, Paolo Bonzini, 2022/10/13
- [PATCH 34/35] target/i386: move 3DNow to the new decoder, Paolo Bonzini, 2022/10/13

Prev by Date: [PATCH 15/35] target/i386: implement additional AVX comparison operators
Next by Date: [PATCH 30/35] target/i386: implement XSAVE and XRSTOR of AVX registers
Previous by thread: [PATCH 15/35] target/i386: implement additional AVX comparison operators
Next by thread: [PATCH 30/35] target/i386: implement XSAVE and XRSTOR of AVX registers
Index(es):
- Date
- Thread