target/loongarch: Implement vreplve vpack vpick
This patch includes:
- VREPLVE[I].{B/H/W/D};
- VBSLL.V, VBSRL.V;
- VPACK{EV/OD}.{B/H/W/D};
- VPICK{EV/OD}.{B/H/W/D}.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Song Gao <gaosong@loongson.cn>
Message-Id: <20230504122810.4094787-40-gaosong@loongson.cn>
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 7255a2a..c6cf782 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -833,6 +833,11 @@
output(ctx, mnemonic, "v%d, r%d", a->vd, a->rj);
}
+static void output_vvr(DisasContext *ctx, arg_vvr *a, const char *mnemonic)
+{ /* Operand text: vector regs vd and vj, then general register rk. */
+ output(ctx, mnemonic, "v%d, v%d, r%d", a->vd, a->vj, a->rk);
+}
+
INSN_LSX(vadd_b, vvv)
INSN_LSX(vadd_h, vvv)
INSN_LSX(vadd_w, vvv)
@@ -1594,3 +1599,33 @@
INSN_LSX(vreplgr2vr_h, vr)
INSN_LSX(vreplgr2vr_w, vr)
INSN_LSX(vreplgr2vr_d, vr)
+
+INSN_LSX(vreplve_b, vvr) /* vreplve: lane index taken from GPR rk */
+INSN_LSX(vreplve_h, vvr)
+INSN_LSX(vreplve_w, vvr)
+INSN_LSX(vreplve_d, vvr)
+INSN_LSX(vreplvei_b, vv_i) /* vreplvei: lane index is an immediate */
+INSN_LSX(vreplvei_h, vv_i)
+INSN_LSX(vreplvei_w, vv_i)
+INSN_LSX(vreplvei_d, vv_i)
+/* vbsll/vbsrl: byte shifts of the whole register by an immediate. */
+INSN_LSX(vbsll_v, vv_i)
+INSN_LSX(vbsrl_v, vv_i)
+/* vpack{ev,od}: three-vector-register form. */
+INSN_LSX(vpackev_b, vvv)
+INSN_LSX(vpackev_h, vvv)
+INSN_LSX(vpackev_w, vvv)
+INSN_LSX(vpackev_d, vvv)
+INSN_LSX(vpackod_b, vvv)
+INSN_LSX(vpackod_h, vvv)
+INSN_LSX(vpackod_w, vvv)
+INSN_LSX(vpackod_d, vvv)
+/* vpick{ev,od}: three-vector-register form. */
+INSN_LSX(vpickev_b, vvv)
+INSN_LSX(vpickev_h, vvv)
+INSN_LSX(vpickev_w, vvv)
+INSN_LSX(vpickev_d, vvv)
+INSN_LSX(vpickod_b, vvv)
+INSN_LSX(vpickod_h, vvv)
+INSN_LSX(vpickod_w, vvv)
+INSN_LSX(vpickod_d, vvv)
diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h
index 8eb2738..51ad694 100644
--- a/target/loongarch/helper.h
+++ b/target/loongarch/helper.h
@@ -653,3 +653,21 @@
DEF_HELPER_3(vsetallnez_h, void, env, i32, i32)
DEF_HELPER_3(vsetallnez_w, void, env, i32, i32)
DEF_HELPER_3(vsetallnez_d, void, env, i32, i32)
+
+DEF_HELPER_4(vpackev_b, void, env, i32, i32, i32) /* env, vd, vj, vk */
+DEF_HELPER_4(vpackev_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vpackev_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vpackev_d, void, env, i32, i32, i32)
+DEF_HELPER_4(vpackod_b, void, env, i32, i32, i32)
+DEF_HELPER_4(vpackod_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vpackod_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vpackod_d, void, env, i32, i32, i32)
+/* vpick{ev,od}: same arguments, (env, vd, vj, vk) register numbers. */
+DEF_HELPER_4(vpickev_b, void, env, i32, i32, i32)
+DEF_HELPER_4(vpickev_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vpickev_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vpickev_d, void, env, i32, i32, i32)
+DEF_HELPER_4(vpickod_b, void, env, i32, i32, i32)
+DEF_HELPER_4(vpickod_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vpickod_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vpickod_d, void, env, i32, i32, i32)
diff --git a/target/loongarch/insn_trans/trans_lsx.c.inc b/target/loongarch/insn_trans/trans_lsx.c.inc
index e722b79..1146ace 100644
--- a/target/loongarch/insn_trans/trans_lsx.c.inc
+++ b/target/loongarch/insn_trans/trans_lsx.c.inc
@@ -3933,3 +3933,147 @@
TRANS(vreplgr2vr_h, gvec_dup, MO_16)
TRANS(vreplgr2vr_w, gvec_dup, MO_32)
TRANS(vreplgr2vr_d, gvec_dup, MO_64)
+
+static bool trans_vreplvei_b(DisasContext *ctx, arg_vv_i *a)
+{
+    /* Env offset of byte element a->imm of vj: the value to broadcast. */
+    uint32_t src = offsetof(CPULoongArchState, fpr[a->vj].vreg.B(a->imm));
+
+    CHECK_SXE;
+    tcg_gen_gvec_dup_mem(MO_8, vec_full_offset(a->vd), src, 16, ctx->vl / 8);
+    return true;
+}
+
+static bool trans_vreplvei_h(DisasContext *ctx, arg_vv_i *a)
+{
+    /* Env offset of 16-bit element a->imm of vj: the value to broadcast. */
+    uint32_t src = offsetof(CPULoongArchState, fpr[a->vj].vreg.H(a->imm));
+
+    CHECK_SXE;
+    tcg_gen_gvec_dup_mem(MO_16, vec_full_offset(a->vd), src, 16, ctx->vl / 8);
+    return true;
+}
+static bool trans_vreplvei_w(DisasContext *ctx, arg_vv_i *a)
+{
+    /* Env offset of 32-bit element a->imm of vj: the value to broadcast. */
+    uint32_t src = offsetof(CPULoongArchState, fpr[a->vj].vreg.W(a->imm));
+
+    CHECK_SXE;
+    tcg_gen_gvec_dup_mem(MO_32, vec_full_offset(a->vd), src, 16, ctx->vl / 8);
+    return true;
+}
+static bool trans_vreplvei_d(DisasContext *ctx, arg_vv_i *a)
+{
+    /* Env offset of 64-bit element a->imm of vj: the value to broadcast. */
+    uint32_t src = offsetof(CPULoongArchState, fpr[a->vj].vreg.D(a->imm));
+
+    CHECK_SXE;
+    tcg_gen_gvec_dup_mem(MO_64, vec_full_offset(a->vd), src, 16, ctx->vl / 8);
+    return true;
+}
+
+static bool gen_vreplve(DisasContext *ctx, arg_vvr *a, int vece, int bit,
+                        void (*func)(TCGv_i64, TCGv_ptr, tcg_target_long))
+{
+    /* Broadcast vj[rk % (LSX_LEN / bit)]; t0 holds its byte offset. */
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_ptr t1 = tcg_temp_new_ptr();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    CHECK_SXE;
+    tcg_gen_andi_i64(t0, gpr_src(ctx, a->rk, EXT_NONE), (LSX_LEN / bit) - 1);
+    tcg_gen_shli_i64(t0, t0, vece);
+    if (HOST_BIG_ENDIAN) {
+        /* BE hosts: lanes assumed mirrored, so flip the index bits (confirm). */
+        tcg_gen_xori_i64(t0, t0, ((LSX_LEN / bit) - 1) << vece);
+    }
+    tcg_gen_trunc_i64_ptr(t1, t0);
+    tcg_gen_add_ptr(t1, t1, cpu_env);
+    func(t2, t1, vec_full_offset(a->vj));
+    tcg_gen_gvec_dup_i64(vece, vec_full_offset(a->vd), 16, ctx->vl / 8, t2);
+    return true;
+}
+
+/* Instances: MemOp element size, element width in bits, element loader. */
+TRANS(vreplve_b, gen_vreplve, MO_8, 8, tcg_gen_ld8u_i64)
+TRANS(vreplve_h, gen_vreplve, MO_16, 16, tcg_gen_ld16u_i64)
+TRANS(vreplve_w, gen_vreplve, MO_32, 32, tcg_gen_ld32u_i64)
+TRANS(vreplve_d, gen_vreplve, MO_64, 64, tcg_gen_ld_i64)
+
+static bool trans_vbsll_v(DisasContext *ctx, arg_vv_i *a)
+{
+    /* Left shift of the 128-bit vj by (imm % 16) bytes, zero-filled. */
+    const int shift = (a->imm & 0xf) * 8;
+    TCGv_i64 res_hi, res_lo, src_hi, src_lo;
+
+    CHECK_SXE;
+
+    res_hi = tcg_temp_new_i64();
+    res_lo = tcg_temp_new_i64();
+    src_hi = tcg_temp_new_i64();
+    src_lo = tcg_temp_new_i64();
+
+    get_vreg64(src_lo, a->vj, 0);
+    if (shift >= 64) {
+        /* Only bits from the low half survive, landing in the high half. */
+        tcg_gen_shli_i64(res_hi, src_lo, shift - 64);
+        res_lo = tcg_constant_i64(0);
+    } else {
+        /* hi = vj.hi << shift, refilled with the top bits of vj.lo. */
+        get_vreg64(src_hi, a->vj, 1);
+        tcg_gen_extract2_i64(res_hi, src_lo, src_hi, 64 - shift);
+        tcg_gen_shli_i64(res_lo, src_lo, shift);
+    }
+
+    set_vreg64(res_hi, a->vd, 1);
+    set_vreg64(res_lo, a->vd, 0);
+    return true;
+}
+
+static bool trans_vbsrl_v(DisasContext *ctx, arg_vv_i *a)
+{
+    /* Logical right shift of the 128-bit vj by (imm % 16) bytes. */
+    const int shift = (a->imm & 0xf) * 8;
+    TCGv_i64 res_hi, res_lo, src_hi, src_lo;
+
+    CHECK_SXE;
+
+    res_hi = tcg_temp_new_i64();
+    res_lo = tcg_temp_new_i64();
+    src_hi = tcg_temp_new_i64();
+    src_lo = tcg_temp_new_i64();
+
+    get_vreg64(src_hi, a->vj, 1);
+    if (shift >= 64) {
+        /* Only bits from the high half survive, landing in the low half. */
+        tcg_gen_shri_i64(res_lo, src_hi, shift - 64);
+        res_hi = tcg_constant_i64(0);
+    } else {
+        /* lo = vj.lo >> shift, topped up with the low bits of vj.hi. */
+        get_vreg64(src_lo, a->vj, 0);
+        tcg_gen_extract2_i64(res_lo, src_lo, src_hi, shift);
+        tcg_gen_shri_i64(res_hi, src_hi, shift);
+    }
+
+    set_vreg64(res_hi, a->vd, 1);
+    set_vreg64(res_lo, a->vd, 0);
+    return true;
+}
+
+TRANS(vpackev_b, gen_vvv, gen_helper_vpackev_b) /* vpack*: gen_vvv + helper */
+TRANS(vpackev_h, gen_vvv, gen_helper_vpackev_h)
+TRANS(vpackev_w, gen_vvv, gen_helper_vpackev_w)
+TRANS(vpackev_d, gen_vvv, gen_helper_vpackev_d)
+TRANS(vpackod_b, gen_vvv, gen_helper_vpackod_b)
+TRANS(vpackod_h, gen_vvv, gen_helper_vpackod_h)
+TRANS(vpackod_w, gen_vvv, gen_helper_vpackod_w)
+TRANS(vpackod_d, gen_vvv, gen_helper_vpackod_d)
+/* vpick{ev,od}: likewise handed to gen_vvv with the matching helper. */
+TRANS(vpickev_b, gen_vvv, gen_helper_vpickev_b)
+TRANS(vpickev_h, gen_vvv, gen_helper_vpickev_h)
+TRANS(vpickev_w, gen_vvv, gen_helper_vpickev_w)
+TRANS(vpickev_d, gen_vvv, gen_helper_vpickev_d)
+TRANS(vpickod_b, gen_vvv, gen_helper_vpickod_b)
+TRANS(vpickod_h, gen_vvv, gen_helper_vpickod_h)
+TRANS(vpickod_w, gen_vvv, gen_helper_vpickod_w)
+TRANS(vpickod_d, gen_vvv, gen_helper_vpickod_d)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index d1d255a..ab9e9e4 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -499,6 +499,7 @@
&vr_i vd rj imm
&rv_i rd vj imm
&vr vd rj
+&vvr vd vj rk # vector destination, vector source, general register
#
# LSX Formats
@@ -506,6 +507,8 @@
@vv .... ........ ..... ..... vj:5 vd:5 &vv
@cv .... ........ ..... ..... vj:5 .. cd:3 &cv
@vvv .... ........ ..... vk:5 vj:5 vd:5 &vvv
+@vv_ui1 .... ........ ..... .... imm:1 vj:5 vd:5 &vv_i # 1-bit imm (vreplvei_d)
+@vv_ui2 .... ........ ..... ... imm:2 vj:5 vd:5 &vv_i # 2-bit imm (vreplvei_w)
@vv_ui3 .... ........ ..... .. imm:3 vj:5 vd:5 &vv_i
@vv_ui4 .... ........ ..... . imm:4 vj:5 vd:5 &vv_i
@vv_ui5 .... ........ ..... imm:5 vj:5 vd:5 &vv_i
@@ -524,6 +527,7 @@
@rv_ui2 .... ........ ..... ... imm:2 vj:5 rd:5 &rv_i
@rv_ui1 .... ........ ..... .... imm:1 vj:5 rd:5 &rv_i
@vr .... ........ ..... ..... rj:5 vd:5 &vr
+@vvr .... ........ ..... rk:5 vj:5 vd:5 &vvr # rk [14:10], vj [9:5], vd [4:0]
vadd_b 0111 00000000 10100 ..... ..... ..... @vvv
vadd_h 0111 00000000 10101 ..... ..... ..... @vvv
@@ -1197,3 +1201,33 @@
vreplgr2vr_h 0111 00101001 11110 00001 ..... ..... @vr
vreplgr2vr_w 0111 00101001 11110 00010 ..... ..... @vr
vreplgr2vr_d 0111 00101001 11110 00011 ..... ..... @vr
+
+vreplve_b 0111 00010010 00100 ..... ..... ..... @vvr # lane index read from rk
+vreplve_h 0111 00010010 00101 ..... ..... ..... @vvr
+vreplve_w 0111 00010010 00110 ..... ..... ..... @vvr
+vreplve_d 0111 00010010 00111 ..... ..... ..... @vvr
+vreplvei_b 0111 00101111 01111 0 .... ..... ..... @vv_ui4 # imm: 4/3/2/1 bits
+vreplvei_h 0111 00101111 01111 10 ... ..... ..... @vv_ui3
+vreplvei_w 0111 00101111 01111 110 .. ..... ..... @vv_ui2
+vreplvei_d 0111 00101111 01111 1110 . ..... ..... @vv_ui1
+# Byte shifts decode a 5-bit imm; the translator uses only its low 4 bits.
+vbsll_v 0111 00101000 11100 ..... ..... ..... @vv_ui5
+vbsrl_v 0111 00101000 11101 ..... ..... ..... @vv_ui5
+# vpack{ev,od}: interleave even/odd lanes of vk and vj.
+vpackev_b 0111 00010001 01100 ..... ..... ..... @vvv
+vpackev_h 0111 00010001 01101 ..... ..... ..... @vvv
+vpackev_w 0111 00010001 01110 ..... ..... ..... @vvv
+vpackev_d 0111 00010001 01111 ..... ..... ..... @vvv
+vpackod_b 0111 00010001 10000 ..... ..... ..... @vvv
+vpackod_h 0111 00010001 10001 ..... ..... ..... @vvv
+vpackod_w 0111 00010001 10010 ..... ..... ..... @vvv
+vpackod_d 0111 00010001 10011 ..... ..... ..... @vvv
+# vpick{ev,od}: gather even/odd lanes of vk (low half) and vj (high half).
+vpickev_b 0111 00010001 11100 ..... ..... ..... @vvv
+vpickev_h 0111 00010001 11101 ..... ..... ..... @vvv
+vpickev_w 0111 00010001 11110 ..... ..... ..... @vvv
+vpickev_d 0111 00010001 11111 ..... ..... ..... @vvv
+vpickod_b 0111 00010010 00000 ..... ..... ..... @vvv
+vpickod_h 0111 00010010 00001 ..... ..... ..... @vvv
+vpickod_w 0111 00010010 00010 ..... ..... ..... @vvv
+vpickod_d 0111 00010010 00011 ..... ..... ..... @vvv
diff --git a/target/loongarch/lsx_helper.c b/target/loongarch/lsx_helper.c
index 51c4109..84b8f72 100644
--- a/target/loongarch/lsx_helper.c
+++ b/target/loongarch/lsx_helper.c
@@ -2766,3 +2766,91 @@
SETALLNEZ(vsetallnez_h, MO_16)
SETALLNEZ(vsetallnez_w, MO_32)
SETALLNEZ(vsetallnez_d, MO_64)
+
+#define VPACKEV(NAME, BIT, E)                                        \
+void HELPER(NAME)(CPULoongArchState *env,                            \
+                  uint32_t vd, uint32_t vj, uint32_t vk)             \
+{                                                                    \
+    /* Interleave even lanes: Vk -> even slots, Vj -> odd slots. */  \
+    VReg *Vd = &env->fpr[vd].vreg;                                   \
+    VReg *Vj = &env->fpr[vj].vreg;                                   \
+    VReg *Vk = &env->fpr[vk].vreg;                                   \
+    VReg packed; /* staged so Vd may alias Vj or Vk */               \
+                                                                     \
+    for (int n = 0; n < LSX_LEN / BIT; n++) {                        \
+        packed.E(2 * n) = Vk->E(2 * n);                              \
+        packed.E(2 * n + 1) = Vj->E(2 * n);                          \
+    }                                                                \
+    *Vd = packed;                                                    \
+}
+
+VPACKEV(vpackev_b, 16, B) /* BIT spans one even/odd lane pair */
+VPACKEV(vpackev_h, 32, H)
+VPACKEV(vpackev_w, 64, W)
+VPACKEV(vpackev_d, 128, D)
+
+#define VPACKOD(NAME, BIT, E)                                        \
+void HELPER(NAME)(CPULoongArchState *env,                            \
+                  uint32_t vd, uint32_t vj, uint32_t vk)             \
+{                                                                    \
+    /* Interleave odd lanes: Vk -> even slots, Vj -> odd slots. */   \
+    VReg *Vd = &env->fpr[vd].vreg;                                   \
+    VReg *Vj = &env->fpr[vj].vreg;                                   \
+    VReg *Vk = &env->fpr[vk].vreg;                                   \
+    VReg packed; /* staged so Vd may alias Vj or Vk */               \
+                                                                     \
+    for (int n = 0; n < LSX_LEN / BIT; n++) {                        \
+        packed.E(2 * n) = Vk->E(2 * n + 1);                          \
+        packed.E(2 * n + 1) = Vj->E(2 * n + 1);                      \
+    }                                                                \
+    *Vd = packed;                                                    \
+}
+
+VPACKOD(vpackod_b, 16, B) /* BIT spans one even/odd lane pair */
+VPACKOD(vpackod_h, 32, H)
+VPACKOD(vpackod_w, 64, W)
+VPACKOD(vpackod_d, 128, D)
+
+#define VPICKEV(NAME, BIT, E)                                        \
+void HELPER(NAME)(CPULoongArchState *env,                            \
+                  uint32_t vd, uint32_t vj, uint32_t vk)             \
+{                                                                    \
+    /* Even lanes of Vk fill the lower half, Vj's the upper half. */ \
+    VReg *Vd = &env->fpr[vd].vreg;                                   \
+    VReg *Vj = &env->fpr[vj].vreg;                                   \
+    VReg *Vk = &env->fpr[vk].vreg;                                   \
+    VReg picked; /* staged so Vd may alias Vj or Vk */               \
+                                                                     \
+    for (int n = 0; n < LSX_LEN / BIT; n++) {                        \
+        picked.E(n) = Vk->E(2 * n);                                  \
+        picked.E(n + LSX_LEN / BIT) = Vj->E(2 * n);                  \
+    }                                                                \
+    *Vd = picked;                                                    \
+}
+
+VPICKEV(vpickev_b, 16, B) /* BIT spans one even/odd lane pair */
+VPICKEV(vpickev_h, 32, H)
+VPICKEV(vpickev_w, 64, W)
+VPICKEV(vpickev_d, 128, D)
+
+#define VPICKOD(NAME, BIT, E)                                        \
+void HELPER(NAME)(CPULoongArchState *env,                            \
+                  uint32_t vd, uint32_t vj, uint32_t vk)             \
+{                                                                    \
+    /* Odd lanes of Vk fill the lower half, Vj's the upper half. */  \
+    VReg *Vd = &env->fpr[vd].vreg;                                   \
+    VReg *Vj = &env->fpr[vj].vreg;                                   \
+    VReg *Vk = &env->fpr[vk].vreg;                                   \
+    VReg picked; /* staged so Vd may alias Vj or Vk */               \
+                                                                     \
+    for (int n = 0; n < LSX_LEN / BIT; n++) {                        \
+        picked.E(n) = Vk->E(2 * n + 1);                              \
+        picked.E(n + LSX_LEN / BIT) = Vj->E(2 * n + 1);              \
+    }                                                                \
+    *Vd = picked;                                                    \
+}
+
+VPICKOD(vpickod_b, 16, B) /* BIT spans one even/odd lane pair */
+VPICKOD(vpickod_h, 32, H)
+VPICKOD(vpickod_w, 64, W)
+VPICKOD(vpickod_d, 128, D)