[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 4/8] target/riscv: 128-bit arithmetic and logic instructions
From: |
Richard Henderson |
Subject: |
Re: [PATCH 4/8] target/riscv: 128-bit arithmetic and logic instructions |
Date: |
Mon, 30 Aug 2021 20:30:28 -0700 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.11.0 |
On 8/30/21 10:16 AM, Frédéric Pétrot wrote:
Adding the support for the 128-bit arithmetic and logic instructions.
Remember that all (i) instructions are now acting on 128-bit registers, that
a few others are added to cope with values that are held on 64 bits within
the 128-bit registers, and that the ones that cope with values on 32-bit
must also be modified for proper sign extension.
Most algorithms taken from Hacker's Delight.
Signed-off-by: Frédéric Pétrot <frederic.petrot@univ-grenoble-alpes.fr>
Co-authored-by: Fabien Portas <fabien.portas@grenoble-inp.org>
---
target/riscv/insn32.decode | 13 +
target/riscv/insn_trans/trans_rvi.c.inc | 955 +++++++++++++++++++++++-
target/riscv/translate.c | 25 +
3 files changed, 976 insertions(+), 17 deletions(-)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 225669e277..2fe7e1dd36 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -22,6 +22,7 @@
%rs1 15:5
%rd 7:5
%sh5 20:5
+%sh6 20:6
%sh7 20:7
%csr 20:12
@@ -91,6 +92,9 @@
# Formats 64:
@sh5 ....... ..... ..... ... ..... ....... &shift shamt=%sh5 %rs1 %rd
+# Formats 128:
+@sh6 ...... ...... ..... ... ..... ....... &shift shamt=%sh6 %rs1 %rd
+
# *** Privileged Instructions ***
ecall 000000000000 00000 000 00000 1110011
ebreak 000000000001 00000 000 00000 1110011
@@ -166,6 +170,15 @@ sraw 0100000 ..... ..... 101 ..... 0111011 @r
ldu ............ ..... 111 ..... 0000011 @i
lq ............ ..... 010 ..... 0001111 @i
sq ............ ..... 100 ..... 0100011 @s
+addid ............ ..... 000 ..... 1011011 @i
+sllid 000000 ...... ..... 001 ..... 1011011 @sh6
+srlid 000000 ...... ..... 101 ..... 1011011 @sh6
+sraid 010000 ...... ..... 101 ..... 1011011 @sh6
+addd 0000000 ..... ..... 000 ..... 1111011 @r
+subd 0100000 ..... ..... 000 ..... 1111011 @r
+slld 0000000 ..... ..... 001 ..... 1111011 @r
+srld 0000000 ..... ..... 101 ..... 1111011 @r
+srad 0100000 ..... ..... 101 ..... 1111011 @r
# *** RV32M Standard Extension ***
mul 0000001 ..... ..... 000 ..... 0110011 @r
diff --git a/target/riscv/insn_trans/trans_rvi.c.inc
b/target/riscv/insn_trans/trans_rvi.c.inc
index 772330a766..0401ba3d69 100644
--- a/target/riscv/insn_trans/trans_rvi.c.inc
+++ b/target/riscv/insn_trans/trans_rvi.c.inc
@@ -26,14 +26,20 @@ static bool trans_illegal(DisasContext *ctx, arg_empty *a)
static bool trans_c64_illegal(DisasContext *ctx, arg_empty *a)
{
- REQUIRE_64BIT(ctx);
- return trans_illegal(ctx, a);
+ REQUIRE_64_OR_128BIT(ctx);
+ return trans_illegal(ctx, a);
}
static bool trans_lui(DisasContext *ctx, arg_lui *a)
{
if (a->rd != 0) {
tcg_gen_movi_tl(cpu_gpr[a->rd], a->imm);
+#if defined(TARGET_RISCV128)
+ if (is_128bit(ctx)) {
+ tcg_gen_ext_i64_i128(cpu_gpr[a->rd], cpu_gprh[a->rd],
+ cpu_gpr[a->rd]);
+ }
+#endif
}
return true;
}
I think it is a mistake to introduce all of these ifdefs.
If 128-bit is not enabled, then is_128bit should evaluate to false, and all should be
well. As for cpu_gprh[], that should be hidden behind some function/macro so that if
128-bit is not enabled we get qemu_build_not_reached().
Finally, you can compute this immediate value directly. Don't leave it to the optimizer
to provide the extension.
static bool trans_auipc(DisasContext *ctx, arg_auipc *a)
{
if (a->rd != 0) {
+#if defined(TARGET_RISCV128)
+ if (is_128bit(ctx)) {
+ /* TODO : when pc is 128 bits, use all its bits */
+ TCGv pc = tcg_const_tl(ctx->base.pc_next),
+ imm = tcg_const_tl(a->imm),
+ immh = tcg_const_tl((a->imm & 0x80000)
+ ? 0xffffffffffffffff : 0),
No need to test bits here: -(a->imm < 0) will do fine.
+ cnst_zero = tcg_const_tl(0);
+ tcg_gen_add2_tl(cpu_gpr[a->rd], cpu_gprh[a->rd], pc, cnst_zero,
+ imm, immh);
+ tcg_temp_free(pc);
+ tcg_temp_free(imm);
+ tcg_temp_free(immh);
+ tcg_temp_free(cnst_zero);
+ return true;
tcg_constant_tl, not tcg_const_tl and no tcg_temp_free.
+ case TCG_COND_LT:
+ {
+ TCGv tmp1 = tcg_temp_new(),
+ tmp2 = tcg_temp_new();
+
+ tcg_gen_xor_tl(tmp1, rh, ah);
+ tcg_gen_xor_tl(tmp2, ah, bh);
+ tcg_gen_and_tl(tmp1, tmp1, tmp2);
+ tcg_gen_xor_tl(tmp1, rh, tmp1);
+ tcg_gen_setcondi_tl(TCG_COND_LT, rl, tmp1, 0); /* Check sign bit */
+
+ tcg_temp_free(tmp1);
+ tcg_temp_free(tmp2);
+ break;
+ }
Incorrect, as you're not examining the low parts at all.
+
+ case TCG_COND_GE:
+ /* We invert the result of TCG_COND_LT */
+ gen_setcond_128(rl, rh, al, ah, bl, bh, TCG_COND_LT);
+ tcg_gen_setcondi_tl(TCG_COND_EQ, rl, rl, 0);
Inversion of a boolean is better as xor with 1.
+ case TCG_COND_LTU:
+ {
+ TCGv tmp1 = tcg_temp_new(),
+ tmp2 = tcg_temp_new();
+
+ tcg_gen_eqv_tl(tmp1, ah, bh);
+ tcg_gen_and_tl(tmp1, tmp1, rh);
+ tcg_gen_not_tl(tmp2, ah);
+ tcg_gen_and_tl(tmp2, tmp2, bh);
+ tcg_gen_or_tl(tmp1, tmp1, tmp2);
+
+ tcg_gen_setcondi_tl(TCG_COND_LT, rl, tmp1, 0); /* Check sign bit */
+
+ tcg_temp_free(tmp1);
+ tcg_temp_free(tmp2);
+ break;
+ }
Again, missing comparison of low parts.
@@ -93,7 +205,28 @@ static bool gen_branch(DisasContext *ctx, arg_b *a, TCGCond cond)
gen_get_gpr(source1, a->rs1);
gen_get_gpr(source2, a->rs2);
+#if defined(TARGET_RISCV128)
+ if (is_128bit(ctx)) {
+ TCGv source1h, source2h, tmph, tmpl;
+ source1h = tcg_temp_new();
+ source2h = tcg_temp_new();
+ tmph = tcg_temp_new();
+ tmpl = tcg_temp_new();
+ gen_get_gprh(source1h, a->rs1);
+ gen_get_gprh(source2h, a->rs2);
+
+ gen_setcond_128(tmpl, tmph, source1, source1h, source2, source2h,
cond);
+ tcg_gen_brcondi_tl(TCG_COND_NE, tmpl, 0, l);
+ tcg_temp_free(source1h);
+ tcg_temp_free(source2h);
+ tcg_temp_free(tmph);
+ tcg_temp_free(tmpl);
setcond feeding brcond results in too many comparisons, usually.
In this instance it may be just as easy to generate multiple branches, in general. But
EQ/NE should not be
setcond t1, al, bl, eq
setcond t2, ah, bh, eq
and t3, t1, t2
brcond t3, 0, ne
but
xor t1, al, bl
xor t2, ah, bh
or t3, t1, t2
brcond t3, 0, eq
static bool trans_srli(DisasContext *ctx, arg_srli *a)
{
+#if defined(TARGET_RISCV128)
+ if (is_128bit(ctx)) {
+ if (a->shamt >= 128) {
+ return false;
+ }
+
+ if (a->rd != 0 && a->shamt != 0) {
+ TCGv rs = tcg_temp_new(),
+ rsh = tcg_temp_new(),
+ res = tcg_temp_new(),
+ resh = tcg_temp_new(),
+ tmp = tcg_temp_new();
+ gen_get_gpr(rs, a->rs1);
+ gen_get_gprh(rsh, a->rs1);
+
+ /*
+ * Computation of double-length right logical shift,
+ * adapted for immediates from section 2.17 of Hacker's Delight
+ */
+ if (a->shamt >= 64) {
+ tcg_gen_movi_tl(res, 0);
+ } else {
+ tcg_gen_shri_tl(res, rs, a->shamt);
+ }
+ if (64 - a->shamt < 0) {
+ tcg_gen_movi_tl(tmp, 0);
+ } else {
+ tcg_gen_shli_tl(tmp, rsh, 64 - a->shamt);
+ }
+ tcg_gen_or_tl(res, res, tmp);
+ if (a->shamt - 64 < 0) {
+ tcg_gen_movi_tl(tmp, 0);
+ } else {
+ tcg_gen_shri_tl(tmp, rsh, a->shamt - 64);
+ }
+ tcg_gen_or_tl(res, res, tmp);
+
+ if (a->shamt >= 64) {
+ tcg_gen_movi_tl(resh, 0);
+ } else {
+ tcg_gen_shri_tl(resh, rsh, a->shamt);
+ }
We have tcg_gen_extract2_i64 for the purpose of double-word immediate shifts. You should
be doing
if (shamt >= 64) {
tcg_gen_shri_tl(resl, srch, shamt - 64);
tcg_gen_movi_tl(resh, 0);
} else {
tcg_gen_extract2_tl(resl, srcl, srch, shamt);
tcg_gen_shri_tl(resh, srch, shamt);
}
static bool trans_srai(DisasContext *ctx, arg_srai *a)
if (shamt >= 64) {
tcg_gen_sari_tl(resl, srch, shamt - 64);
tcg_gen_sari_tl(resh, srch, 63);
} else {
tcg_gen_extract2_tl(resl, srcl, srch, shamt);
tcg_gen_sari_tl(resh, srch, shamt);
}
static bool trans_slli(DisasContext *ctx, arg_slli *a)
if (shamt >= 64) {
tcg_gen_shli_tl(resh, srcl, shamt - 64);
tcg_gen_movi_tl(resl, 0);
} else {
tcg_gen_extract2_tl(resh, srcl, srch, 64 - shamt);
tcg_gen_shli_tl(resl, srcl, shamt);
}
C.f. tcg/tcg-op.c, tcg_gen_shifti_i64, which is doing the same thing for i32.
+#if defined(TARGET_RISCV128)
+enum M128_DIR { M128_LEFT, M128_RIGHT, M128_RIGHT_ARITH };
+static void gen_shift_mod128(TCGv ret, TCGv arg1, TCGv arg2, enum M128_DIR dir)
+{
+ TCGv tmp1 = tcg_temp_new(),
+ tmp2 = tcg_temp_new(),
+ cnst_zero = tcg_const_tl(0),
+ sgn = tcg_temp_new();
+
+ tcg_gen_setcondi_tl(TCG_COND_GE, tmp1, arg2, 64);
+ tcg_gen_setcondi_tl(TCG_COND_LT, tmp2, arg2, 0);
+ tcg_gen_or_tl(tmp1, tmp1, tmp2);
What in the world are you doing with signed comparisons?
+ tcg_gen_andi_tl(tmp2, arg2, 0x3f);
You should have one test with 0x3f and one with 0x40.
r~
- [PATCH 3/8] target/riscv: Addition of 128-bit ldu, lq and sq instructions, (continued)
- [PATCH 6/8] target/riscv: Support of compiler's 128-bit integer types, Frédéric Pétrot, 2021/08/30
- [PATCH 5/8] target/riscv: 128-bit multiply and divide, Frédéric Pétrot, 2021/08/30
- [PATCH 8/8] target/riscv: Support for 128-bit satp, Frédéric Pétrot, 2021/08/30
- [PATCH 7/8] target/riscv: 128-bit support for some csrs, Frédéric Pétrot, 2021/08/30
- Re: [PATCH 1/8] target/riscv: Settings for 128-bit extension support, Alistair Francis, 2021/08/30