Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract


From: Richard Henderson
Subject: Re: [PATCH v5 24/60] target/riscv: vector single-width averaging add and subtract
Date: Sat, 14 Mar 2020 18:00:58 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.4.1

On 3/14/20 4:12 PM, LIU Zhiwei wrote:
> I am not sure whether I understand it correctly. In my opinion, the code
> should be modified like this:
> 
> static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     uint8_t round = res & 0x1;
>     res   = (res >> 1) + round;
>     return res;
> }
> 
> static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     uint8_t round = ((res & 0x3) == 0x3);
>     res   = (res >> 1) + round;
>     return res;
> }
> 
> static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     res   = (res >> 1);
>     return res;
> }
> 
> static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     uint8_t round = ((res & 0x3) == 0x1);
>     res   = (res >> 1) + round;
>     return res;
> }
> 
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)
> 
> void do_vext_vv_env(void *vd, void *v0, void *vs1,
>                     void *vs2, CPURISCVState *env, uint32_t desc,
>                     uint32_t esz, uint32_t dsz,
>                     opivv2_fn *fn, clear_fn *clearfn)
> {
>     uint32_t vlmax = vext_maxsz(desc) / esz;
>     uint32_t mlen = vext_mlen(desc);
>     uint32_t vm = vext_vm(desc);
>     uint32_t vl = env->vl;
>     uint32_t i;
>     for (i = 0; i < vl; i++) {
>         if (!vm && !vext_elem_mask(v0, mlen, i)) {
>             continue;
>         }
>         fn(vd, vs1, vs2, i, env);
>     }
>     if (i != 0) {
>         clearfn(vd, vl, vl * dsz, vlmax * dsz);
>     }
> }
> 
> #define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)         \
> void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
>                   void *vs2, CPURISCVState *env,          \
>                   uint32_t desc)                          \
> {                                                         \
>     static opivv2_fn *fns[4] = {                          \
>         NAME##_rnu, NAME##_rne,                           \
>         NAME##_rdn, NAME##_rod                            \
>     };                                                    \
>     do_vext_vv_env(vd, v0, vs1, vs2, env, desc,           \
>                    ESZ, DSZ, fns[env->vxrm],              \
>                    CLEAR_FN);                              \
> }
> 
> Is this right?

While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.
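For reference, get_round() itself is not shown in this thread; following the
vxrm fixed-point rounding rules in the spec, it could look roughly like the
sketch below (only a sketch, assuming the signature used in the code further
down and 0 < shift < 64):

static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d = (v >> shift) & 1;        /* bit that becomes the new LSB */
    uint8_t d1 = (v >> (shift - 1)) & 1; /* most significant discarded bit */
    uint64_t D1 = v & ((UINT64_C(1) << shift) - 1);       /* v[d-1:0] */
    uint64_t D2 = v & ((UINT64_C(1) << (shift - 1)) - 1); /* v[d-2:0] */

    switch (vxrm) {
    case 0: /* rnu: round-to-nearest-up, add half an LSB */
        return d1;
    case 1: /* rne: round-to-nearest-even */
        return d1 & ((D2 != 0) | d);
    case 2: /* rdn: round-down, i.e. truncate */
        return 0;
    default: /* rod: round-to-odd ("jam"), set LSB if anything is discarded */
        return !d & (D1 != 0);
    }
}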

My suggestion was

static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
    int64_t res = a + b;
    uint8_t round = get_round(vxrm, res, 1);
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;

    /* With signed overflow, bit 64 is inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}
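
A worked example of that fix-up (not from the patch; it assumes wrapping
signed arithmetic, which QEMU gets from building with -fwrapv):

static inline void aadd64_overflow_example(void)
{
    /* a = b = INT64_MAX with rounding mode rdn, so round = 0.
     * The true sum 2^64 - 2 does not fit in 64 bits.                  */
    int64_t a = INT64_MAX, b = INT64_MAX;
    int64_t res = a + b;                               /* wraps to -2  */
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;  /* overflow set */

    /* Bit 64 of the true sum (0) is the inverse of bit 63 of res (1),
     * so flipping bit 63 of res >> 1 recovers the correct average.    */
    assert((((res >> 1) ^ over) + 0) == INT64_MAX);    /* (2^64-2)/2   */
}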

RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)

static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
             uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
             opivv2_rm_fn *fn)
{
    for (uint32_t i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, vxrm);
    }
}

static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
             void *vs2, CPURISCVState *env, uint32_t desc,
             uint32_t esz, uint32_t dsz,
             opivv2_rm_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;

    if (vl == 0) {
        return;
    }

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 0, fn);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 1, fn);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 2, fn);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 3, fn);
        break;
    }

    clearfn(vd, vl, vl * dsz, vlmax * dsz);
}

From vext_vv_rm_2, a constant is passed down through all of the inline
functions, so that a constant arrives in get_round() at the bottom of the call
chain.  At that point all of the expressions are folded by the compiler and we
*should* get generated code very similar to what you have above.
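
For completeness, the per-helper glue would then look something like the
sketch below; the GEN_VEXT_VV_RM name, the do_##NAME convention from
OPIVV2_RM, and the clearb/clearh/clearl/clearq clear functions are assumed
from the rest of the series rather than quoted above:

#define GEN_VEXT_VV_RM(NAME, ESZ, DSZ, CLEAR_FN)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc,             \
                 ESZ, DSZ, do_##NAME, CLEAR_FN);          \
}

GEN_VEXT_VV_RM(vaadd_vv_b, 1, 1, clearb)
GEN_VEXT_VV_RM(vaadd_vv_h, 2, 2, clearh)
GEN_VEXT_VV_RM(vaadd_vv_w, 4, 4, clearl)
GEN_VEXT_VV_RM(vaadd_vv_d, 8, 8, clearq)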


r~


