qemu-riscv
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v3 15/21] target/riscv: support for 128-bit M extension


From: Richard Henderson
Subject: Re: [PATCH v3 15/21] target/riscv: support for 128-bit M extension
Date: Wed, 20 Oct 2021 13:58:05 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Thunderbird/78.13.0

On 10/19/21 2:48 AM, Frédéric Pétrot wrote:
  struct CPURISCVState {
      target_ulong gpr[32];
      target_ulong gprh[32]; /* 64 top bits of the 128-bit registers */
+    target_ulong hlpr[2];  /* scratch registers for 128-bit div/rem helpers */

We have something similar for s390x, but we make use of the helper return value to return one part of the result and only store the other part of the result in env->retxl.

+    cpu_hlpr[0] = tcg_global_mem_new(cpu_env,
+        offsetof(CPURISCVState, hlpr[0]), "helper_reg0");
+    cpu_hlpr[1] = tcg_global_mem_new(cpu_env,
+        offsetof(CPURISCVState, hlpr[1]), "helper_reg1");

You very much do not want to make these global temps.

This requires the helpers to indicate that they clobber temps, which will flush all cached register state across the helper. Just perform the load of the result explicitly after the helper.

+static void gen_mulu2_i128(TCGv rll, TCGv rlh, TCGv rhl, TCGv rhh,
+                           TCGv al, TCGv ah, TCGv bl, TCGv bh)
+{
+    TCGv tmpl = tcg_temp_new(),
+         tmph = tcg_temp_new(),
+         cnst_zero = tcg_constant_tl(0);
+
+    tcg_gen_mulu2_tl(rll, rlh, al, bl);
+
+    tcg_gen_mulu2_tl(tmpl, tmph, al, bh);
+    tcg_gen_add2_tl(rlh, rhl, rlh, cnst_zero, tmpl, tmph);
+    tcg_gen_mulu2_tl(tmpl, tmph, ah, bl);
+    tcg_gen_add2_tl(rlh, tmph, rlh, rhl, tmpl, tmph);
+    /* Overflow detection into rhh */
+    tcg_gen_setcond_tl(TCG_COND_LTU, rhh, tmph, rhl);
+
+    tcg_gen_mov_tl(rhl, tmph);
+
+    tcg_gen_mulu2_tl(tmpl, tmph, ah, bh);
+    tcg_gen_add2_tl(rhl, rhh, rhl, rhh, tmpl, tmph);

It might be clearer to number these 0-3 rather than permute [lh].

I think you don't need to return all 4 words of results; just have gen_mulhu_i128 with 6 parameters, since there's no RV128 instruction that returns the entire result.

+static void gen_mul_i128(TCGv rll, TCGv rlh,
+                         TCGv rs1l, TCGv rs1h, TCGv rs2l, TCGv rs2h)
+{
+    TCGv rhl = tcg_temp_new(),
+         rhh = tcg_temp_new();
+
+    gen_mulu2_i128(rll, rlh, rhl, rhh, rs1l, rs1h, rs2l, rs2h);
+
+    tcg_temp_free(rhl);
+    tcg_temp_free(rhh);
+}

This is much simpler than gen_mulu2_i128.

+static void gen_mulh_i128(TCGv rhl, TCGv rhh,
+                          TCGv rs1l, TCGv rs1h, TCGv rs2l, TCGv rs2h)
+{
+    TCGv rll = tcg_temp_new(),
+         rlh = tcg_temp_new(),
+         rlln = tcg_temp_new(),
+         rlhn = tcg_temp_new(),
+         rhln = tcg_temp_new(),
+         rhhn = tcg_temp_new(),
+         sgnres = tcg_temp_new(),
+         tmp = tcg_temp_new(),
+         cnst_one = tcg_constant_tl(1),
+         cnst_zero = tcg_constant_tl(0);
+
+    /* Extract sign of result (=> sgn(a) xor sgn(b)) */
+    tcg_gen_setcondi_tl(TCG_COND_LT, sgnres, rs1h, 0);
+    tcg_gen_setcondi_tl(TCG_COND_LT, tmp, rs2h, 0);
+    tcg_gen_xor_tl(sgnres, sgnres, tmp);
+
+    /* Take absolute value of operands */
+    tcg_gen_sari_tl(rhl, rs1h, 63);
+    tcg_gen_add2_tl(rlln, rlhn, rs1l, rs1h, rhl, rhl);
+    tcg_gen_xor_tl(rlln, rlln, rhl);
+    tcg_gen_xor_tl(rlhn, rlhn, rhl);
+
+    tcg_gen_sari_tl(rhl, rs2h, 63);
+    tcg_gen_add2_tl(rhln, rhhn, rs2l, rs2h, rhl, rhl);
+    tcg_gen_xor_tl(rhln, rhln, rhl);
+    tcg_gen_xor_tl(rhhn, rhhn, rhl);
+
+    /* Unsigned multiplication */
+    gen_mulu2_i128(rll, rlh, rhl, rhh, rlln, rlhn, rhln, rhhn);
+
+    /* Negation of result (two's complement : ~res + 1) */
+    tcg_gen_not_tl(rlln, rll);
+    tcg_gen_not_tl(rlhn, rlh);
+    tcg_gen_not_tl(rhln, rhl);
+    tcg_gen_not_tl(rhhn, rhh);
+
+    tcg_gen_add2_tl(rlln, tmp, rlln, cnst_zero, cnst_one, cnst_zero);
+    tcg_gen_add2_tl(rlhn, tmp, rlhn, cnst_zero, tmp, cnst_zero);
+    tcg_gen_add2_tl(rhln, tmp, rhln, cnst_zero, tmp, cnst_zero);
+    tcg_gen_add2_tl(rhhn, tmp, rhhn, cnst_zero, tmp, cnst_zero);
+
+    /* Move conditionally result or -result depending on result sign */
+    tcg_gen_movcond_tl(TCG_COND_NE, rhl, sgnres, cnst_zero, rhln, rhl);
+    tcg_gen_movcond_tl(TCG_COND_NE, rhh, sgnres, cnst_zero, rhhn, rhh);
+
+    tcg_temp_free(rll);
+    tcg_temp_free(rlh);
+    tcg_temp_free(rlln);
+    tcg_temp_free(rlhn);
+    tcg_temp_free(rhln);
+    tcg_temp_free(rhhn);
+    tcg_temp_free(sgnres);
+    tcg_temp_free(tmp);
 }

You don't need to compute abs or conditional negation.

See tcg_gen_muls2_i32, adjust for negative inputs. It's simply subtracting one input from the high part when the other input is negative.

+static void gen_mulhsu_i128(TCGv rhl, TCGv rhh,
+                            TCGv rs1l, TCGv rs1h, TCGv rs2l, TCGv rs2h)

Similarly, but of course only one operand may be negative.

+static void gen_div_i128(TCGv rdl, TCGv rdh,
+                         TCGv rs1l, TCGv rs1h, TCGv rs2l, TCGv rs2h)
+{
+    gen_helper_divs_i128(cpu_env, (TCGv_i64)rs1l, (TCGv_i64)rs1h,
+                                  (TCGv_i64)rs2l, (TCGv_i64)rs2h);

Do not cast, just make the arguments target_long always.

Anyway, per above, this becomes

    gen_helper_divs_i128(rdl, cpu_env, rs1l, rs1h, rs2l, rs2h);
    tcg_gen_ld_tl(rdh, cpu_env, offsetof(CPURISCVState, retxh));


r~



reply via email to

[Prev in Thread] Current Thread [Next in Thread]