qemu-riscv
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v4 12/17] target/riscv: support for 128-bit arithmetic instru


From: Richard Henderson
Subject: Re: [PATCH v4 12/17] target/riscv: support for 128-bit arithmetic instructions
Date: Tue, 2 Nov 2021 08:43:28 -0400
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.13.0

On 10/25/21 5:28 AM, Frédéric Pétrot wrote:
+    if (get_xl(ctx) < MXL_RV128 || get_ol(ctx) < MXL_RV128) {

Only the second test.  Two times.

+static bool gen_setcond_i128(TCGv rl, TCGv rh,
+                             TCGv al, TCGv ah,
+                             TCGv bl, TCGv bh,
+                             TCGCond cond)
+{
+    switch (cond) {
+    case TCG_COND_EQ:
+        tcg_gen_xor_tl(rl, al, bl);
+        tcg_gen_xor_tl(rh, ah, bh);
+        tcg_gen_or_tl(rh, rl, rh);
+        tcg_gen_setcondi_tl(TCG_COND_EQ, rl, rh, 0);
+    break;

Indentation.

+
+    case TCG_COND_NE:
+        tcg_gen_xor_tl(rl, al, bl);
+        tcg_gen_xor_tl(rh, ah, bh);
+        tcg_gen_or_tl(rh, rl, rh);
+        tcg_gen_setcondi_tl(TCG_COND_NE, rl, rh, 0);
+        break;
+
+    case TCG_COND_LT:
+    {
+        TCGv tmp1 = tcg_temp_new();
+        TCGv tmp2 = tcg_temp_new();
+
+        tcg_gen_sub2_tl(rl, rh, al, ah, bl, bh);
+        tcg_gen_xor_tl(tmp1, rh, ah);
+        tcg_gen_xor_tl(tmp2, ah, bh);
+        tcg_gen_and_tl(tmp1, tmp1, tmp2);
+        tcg_gen_xor_tl(tmp1, rh, tmp1);
+        tcg_gen_shri_tl(rl, tmp1, 63);
+
+        tcg_temp_free(tmp1);
+        tcg_temp_free(tmp2);
+        break;
+    }
+
+    case TCG_COND_GE:
+        /* Invert result of TCG_COND_LT */
+        gen_setcond_i128(rl, rh, al, ah, bl, bh, TCG_COND_LT);
+        tcg_gen_xori_tl(rl, rl, 1);
+        break;
+
+    case TCG_COND_LTU:
+        gen_setcond_i128(rl, rh, al, ah, bl, bh, TCG_COND_GEU);
+        tcg_gen_xori_tl(rl, rl, 1);
+        break;
+
+    case TCG_COND_GEU:
+    {
+        TCGv tmp1 = tcg_temp_new();
+        TCGv tmp2 = tcg_temp_new();
+        TCGv zero = tcg_constant_tl(0);
+        TCGv one = tcg_constant_tl(1);
+        /* borrow in to second word */
+        tcg_gen_setcond_tl(TCG_COND_LTU, tmp1, al, bl);
+        /* seed third word with 1, which will be result */
+        tcg_gen_sub2_tl(tmp1, tmp2, ah, one, tmp1, zero);
+        tcg_gen_sub2_tl(tmp1, rl, tmp1, tmp2, bh, zero);
+
+        tcg_temp_free(tmp1);
+        tcg_temp_free(tmp2);
+        break;
+    }
+
+    default:
+        return false;

This should be g_assert_not_reached(), not return false.

+    }
+    tcg_gen_movi_tl(rh, 0);
+    return true;
+}

Which begs the question of what the return value is for when you don't even use 
it below.

I think we should treat this as more of a subroutine, and return the final TCGCond required to get the correct result, *without* actually computing the final setcond.

static TCGCond compare_128i(TCGv rl, int rs1, int rs2, TCGCond cond)
{
    TCGv al = get_gpr(ctx, rs1, EXT_SIGN);
    TCGv bl = get_gpr(ctx, rs2, EXT_SIGN);
    TCGv ah = get_gprh(ctx, a->rs1);
    TCGv bh = get_gprh(ctx, a->rs2);
    TCGv rh = tcg_temp_new();
    bool invert = false;

    switch (cond) {
    case TCG_COND_EQ:
    case TCG_COND_NE:
        if (rs2 == 0) {
            tcg_gen_or_tl(rl, al, ah);
        } else {
            tcg_gen_xor_tl(rl, al, bl);
            tcg_gen_xor_tl(rh, ah, bh);
            tcg_gen_or_tl(rl, rl, rh);
        }
        break;

    case TCG_COND_GE:
        invert = true;
        cond = TCG_COND_LT;
        /* fall through */
    case TCG_COND_LT:
        if (rs2 == 0) {
            tcg_gen_mov_tl(rl, ah);
        } else {
            TCGv tmp1 = tcg_temp_new();
            TCGv tmp2 = tcg_temp_new();
            tcg_gen_sub2_tl(rl, rh, al, ah, bl, bh);
            tcg_gen_xor_tl(tmp1, rh, ah);
            tcg_gen_xor_tl(tmp2, ah, bh);
            tcg_gen_and_tl(tmp1, tmp1, tmp2);
            tcg_gen_xor_tl(rl, rh, tmp1);
            tcg_temp_free(tmp1);
            tcg_temp_free(tmp2);
        }
        break;
    ...
    }
    tcg_temp_free(rh);

    if (invert) {
        cond = tcg_invert_cond(cond);
    }
    return cond;
}

static void setcond_128i(...)
{
    cond = compare_128i(rl, rh, al, ah, bl, bh, cond);
    tcg_gen_setcondi_tl(cond, rl, rl, 0);
    tcg_gen_movi_tl(rh, 0);
}

static void branch_128i(...)
{
    TCGv tmpl = tcg_temp_new();
    TCGv tmph = tcg_temp_new();

    cond = compare_128i(tmpl, tmph, al, ah, bl, bh, cond);
    tcg_gen_brcondi_tl(cond, tmpl, 0, label);

    tcg_temp_free(tmpl);
    tcg_temp_free(tmph);
}

+
  static bool gen_branch(DisasContext *ctx, arg_b *a, TCGCond cond)
  {
      TCGLabel *l = gen_new_label();
      TCGv src1 = get_gpr(ctx, a->rs1, EXT_SIGN);
      TCGv src2 = get_gpr(ctx, a->rs2, EXT_SIGN);
- tcg_gen_brcond_tl(cond, src1, src2, l);
+    if (get_xl(ctx) == MXL_RV128) {
+        TCGv src1h = get_gprh(ctx, a->rs1);
+        TCGv src2h = get_gprh(ctx, a->rs2);
+        TCGv tmpl = tcg_temp_new();
+        TCGv tmph = tcg_temp_new();
+
+        /*
+         * Do not use gen_setcond_i128 for EQ and NE as these conditions are
+         * often met and can be more efficiently implemented.
+         * Some comparisons with zero are also simpler, let's do them.
+         */
+        if (cond == TCG_COND_EQ || cond == TCG_COND_NE) {
+            /*
+             * bnez and beqz being used quite often too, lets optimize them,
+             * although QEMU's tcg optimizer handles these cases nicely
+             */
+            if (a->rs2 == 0) {
+                tcg_gen_or_tl(tmpl, src1, src1h);
+                tcg_gen_brcondi_tl(cond, tmpl, 0, l);
+            } else {
+                tcg_gen_xor_tl(tmpl, src1, src2);
+                tcg_gen_xor_tl(tmph, src1h, src2h);
+                tcg_gen_or_tl(tmpl, tmpl, tmph);
+                tcg_gen_brcondi_tl(cond, tmpl, 0, l);
+            }
+        } else if (a->rs2 == 0
+                   && (cond == TCG_COND_LT || cond == TCG_COND_GE)) {
+            tcg_gen_shri_tl(tmpl, src1h, 63);
+            /* hack: transform LT in EQ and GE in NE */
+            tcg_gen_brcondi_tl((cond & 13) | 8, tmpl, 1, l);
+        } else {
+            if (cond == TCG_COND_GE || cond == TCG_COND_LTU) {
+                gen_setcond_i128(tmpl, tmph, src1, src1h, src2, src2h,
+                                 tcg_invert_cond(cond));
+                tcg_gen_brcondi_tl(TCG_COND_EQ, tmpl, 0, l);
+            } else {
+                gen_setcond_i128(tmpl, tmph, src1, src1h, src2, src2h, cond);
+                tcg_gen_brcondi_tl(TCG_COND_NE, tmpl, 0, l);
+            }
+        }

Which then takes care of all of this.

BTW, the hack was quite a hack, and quite unnecessary. You could have dropped the shift and performed the LT/GE brcond directly on src1h.


r~



reply via email to

[Prev in Thread] Current Thread [Next in Thread]