target-s390: Convert 64-bit MULTIPLY LOGICAL

Use a new "retxl" member of CPUS290XState to return the "eXtra Low" part
of a 128-bit value.  That said, this will get used when two independent
values need returning (e.g. quotient+remainder) as well.

At the same time, shuffle the elements of CPUS390XState to get this new
space from existing padding in the structure.

Signed-off-by: Richard Henderson <rth@twiddle.net>
diff --git a/target-s390x/cpu.h b/target-s390x/cpu.h
index 83e618a..afe33dc 100644
--- a/target-s390x/cpu.h
+++ b/target-s390x/cpu.h
@@ -60,17 +60,20 @@
 } ExtQueue;
 
 typedef struct CPUS390XState {
-    uint64_t regs[16];	/* GP registers */
-
-    uint32_t aregs[16];	/* access registers */
-
-    uint32_t fpc;	/* floating-point control register */
+    uint64_t regs[16];     /* GP registers */
     CPU_DoubleU fregs[16]; /* FP registers */
+    uint32_t aregs[16];    /* access registers */
+
+    uint32_t fpc;          /* floating-point control register */
+    uint32_t cc_op;
+
     float_status fpu_status; /* passed to softfloat lib */
 
+    /* The low part of a 128-bit return, or remainder of a divide.  */
+    uint64_t retxl;
+
     PSW psw;
 
-    uint32_t cc_op;
     uint64_t cc_src;
     uint64_t cc_dst;
     uint64_t cc_vr;
@@ -86,8 +89,8 @@
 
     uint64_t cregs[16]; /* control registers */
 
-    int pending_int;
     ExtQueue ext_queue[MAX_EXT_QUEUE];
+    int pending_int;
 
     int ext_index;
 
diff --git a/target-s390x/helper.h b/target-s390x/helper.h
index 2498f83..88a065c 100644
--- a/target-s390x/helper.h
+++ b/target-s390x/helper.h
@@ -9,7 +9,7 @@
 DEF_HELPER_3(mvcl, i32, env, i32, i32)
 DEF_HELPER_4(clm, i32, env, i32, i32, i64)
 DEF_HELPER_4(stcm, void, env, i32, i32, i64)
-DEF_HELPER_3(mlg, void, env, i32, i64)
+DEF_HELPER_FLAGS_3(mul128, TCG_CALL_NO_RWG, i64, env, i64, i64)
 DEF_HELPER_3(dlg, void, env, i32, i64)
 DEF_HELPER_4(srst, i32, env, i32, i32, i32)
 DEF_HELPER_4(clst, i32, env, i32, i32, i32)
diff --git a/target-s390x/insn-data.def b/target-s390x/insn-data.def
index ca12e47..94cd220 100644
--- a/target-s390x/insn-data.def
+++ b/target-s390x/insn-data.def
@@ -54,6 +54,8 @@
 /* MULTIPLY LOGICAL */
     C(0xb996, MLR,     RRE,   Z,   r1p1_32u, r2_32u, new, r1_D32, mul, 0)
     C(0xe396, ML,      RXY_a, Z,   r1p1_32u, m2_32u, new, r1_D32, mul, 0)
+    C(0xb986, MLGR,    RRE,   Z,   r1p1, r2_o, r1_P, 0, mul128, 0)
+    C(0xe386, MLG,     RXY_a, Z,   r1p1, m2_64, r1_P, 0, mul128, 0)
 /* MULTIPLY SINGLE */
     C(0xb252, MSR,     RRE,   Z,   r1_o, r2_o, new, r1_32, mul, 0)
     C(0x7100, MS,      RX_a,  Z,   r1_o, m2_32s, new, r1_32, mul, 0)
diff --git a/target-s390x/int_helper.c b/target-s390x/int_helper.c
index b683709..4f18d29 100644
--- a/target-s390x/int_helper.c
+++ b/target-s390x/int_helper.c
@@ -30,18 +30,11 @@
 #endif
 
 /* 64/64 -> 128 unsigned multiplication */
-void HELPER(mlg)(CPUS390XState *env, uint32_t r1, uint64_t v2)
+uint64_t HELPER(mul128)(CPUS390XState *env, uint64_t v1, uint64_t v2)
 {
-#if HOST_LONG_BITS == 64 && defined(__GNUC__)
-    /* assuming 64-bit hosts have __uint128_t */
-    __uint128_t res = (__uint128_t)env->regs[r1 + 1];
-
-    res *= (__uint128_t)v2;
-    env->regs[r1] = (uint64_t)(res >> 64);
-    env->regs[r1 + 1] = (uint64_t)res;
-#else
-    mulu64(&env->regs[r1 + 1], &env->regs[r1], env->regs[r1 + 1], v2);
-#endif
+    uint64_t reth;
+    mulu64(&env->retxl, &reth, v1, v2);
+    return reth;
 }
 
 /* 128 -> 64/64 unsigned division */
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index a08d471..c38dde8 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -293,6 +293,11 @@
 #endif
 }
 
+static inline void return_low128(TCGv_i64 dest)
+{
+    tcg_gen_ld_i64(dest, cpu_env, offsetof(CPUS390XState, retxl));
+}
+
 static inline void update_psw_addr(DisasContext *s)
 {
     /* psw.addr */
@@ -1563,14 +1568,6 @@
         set_cc_nz_u64(s, regs[r1]);
         tcg_temp_free_i64(tmp3);
         break;
-    case 0x86: /* MLG      R1,D2(X2,B2)     [RXY] */
-        tmp2 = tcg_temp_new_i64();
-        tmp32_1 = tcg_const_i32(r1);
-        tcg_gen_qemu_ld64(tmp2, addr, get_mem_index(s));
-        gen_helper_mlg(cpu_env, tmp32_1, tmp2);
-        tcg_temp_free_i64(tmp2);
-        tcg_temp_free_i32(tmp32_1);
-        break;
     case 0x87: /* DLG      R1,D2(X2,B2)     [RXY] */
         tmp2 = tcg_temp_new_i64();
         tmp32_1 = tcg_const_i32(r1);
@@ -4732,6 +4729,13 @@
     return NO_EXIT;
 }
 
+static ExitStatus op_mul128(DisasContext *s, DisasOps *o)
+{
+    gen_helper_mul128(o->out, cpu_env, o->in1, o->in2);
+    return_low128(o->out2);
+    return NO_EXIT;
+}
+
 static ExitStatus op_sub(DisasContext *s, DisasOps *o)
 {
     tcg_gen_sub_i64(o->out, o->in1, o->in2);
@@ -4800,6 +4804,15 @@
     o->g_out = true;
 }
 
+static void prep_r1_P(DisasContext *s, DisasFields *f, DisasOps *o)
+{
+    /* ??? Specification exception: r1 must be even.  */
+    int r1 = get_field(f, r1);
+    o->out = regs[r1];
+    o->out2 = regs[(r1 + 1) & 15];
+    o->g_out = o->g_out2 = true;
+}
+
 /* ====================================================================== */
 /* The "Write OUTput" generators.  These generally perform some non-trivial
    copy of data to TCG globals, or to main memory.  The trivial cases are
@@ -4844,6 +4857,13 @@
     o->g_in1 = true;
 }
 
+static void in1_r1p1(DisasContext *s, DisasFields *f, DisasOps *o)
+{
+    /* ??? Specification exception: r1 must be even.  */
+    int r1 = get_field(f, r1);
+    o->in1 = load_reg((r1 + 1) & 15);
+}
+
 static void in1_r1p1_32s(DisasContext *s, DisasFields *f, DisasOps *o)
 {
     /* ??? Specification exception: r1 must be even.  */