diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -25597,10 +25597,25 @@ return N0; // If this is an insert of an extracted vector into an undef vector, we can - // just use the input to the extract. + // just use the input to the extract if the types match, and can simplify + // in some cases even if they don't. if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && - N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) - return N1.getOperand(0); + N1.getOperand(1) == N2) { + EVT SrcVT = N1.getOperand(0).getValueType(); + if (SrcVT == VT) + return N1.getOperand(0); + // TODO: To remove the zero check, need to adjust the offset to + // a multiple of the new src type. + if (isNullConstant(N2) && + VT.isScalableVector() == SrcVT.isScalableVector()) { + if (VT.getVectorMinNumElements() >= SrcVT.getVectorMinNumElements()) + return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), + VT, N0, N1.getOperand(0), N2); + else + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), + VT, N1.getOperand(0), N2); + } + } // Simplify scalar inserts into an undef vector: // insert_subvector undef, (splat X), N2 -> splat X diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -103,15 +103,15 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv32: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI8_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: ret @@ -122,15 +122,15 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv64: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI9_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI9_1) @@ -157,15 +157,15 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) { ; CHECK-LABEL: fv128: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: lui a0, %hi(.LCPI10_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vid.v v16 +; CHECK-NEXT: vsaddu.vx v16, v16, a1 +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 ; CHECK-NEXT: vmsltu.vx v16, v8, a2 -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, 
e8, m1, tu, ma ; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI10_1) diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -469,13 +469,14 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v13, v10, a0 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v9, a0 +; CHECK-NEXT: vslidedown.vx v12, v9, a0 ; CHECK-NEXT: add a1, a0, a0 ; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma -; CHECK-NEXT: vslideup.vx v8, v10, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vslidedown.vx v9, v10, a0 +; CHECK-NEXT: vslideup.vx v12, v10, a0 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = call @llvm.vector.extract.nxv6f16.nxv12f16( %in, i64 6) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll @@ -141,13 +141,12 @@ ; LMULMAX1-LABEL: sextload_v4i8_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; LMULMAX1-NEXT: vle8.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v8, v9 +; LMULMAX1-NEXT: vle8.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v9, v10 +; LMULMAX1-NEXT: vsext.vf8 v9, v8 +; LMULMAX1-NEXT: vsext.vf8 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v4i8_v4i64: @@ -165,13 +164,12 @@ ; LMULMAX1-LABEL: zextload_v4i8_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; LMULMAX1-NEXT: vle8.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v8, v9 +; LMULMAX1-NEXT: vle8.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v9, v10 +; LMULMAX1-NEXT: vzext.vf8 v9, v8 +; LMULMAX1-NEXT: vzext.vf8 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v4i8_v4i64: @@ -213,13 +211,12 @@ ; LMULMAX1-LABEL: sextload_v8i8_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; LMULMAX1-NEXT: vle8.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v8, v9 +; LMULMAX1-NEXT: vle8.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v10 +; LMULMAX1-NEXT: vsext.vf4 v9, v8 +; LMULMAX1-NEXT: vsext.vf4 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i8_v8i32: @@ -237,13 +234,12 @@ ; LMULMAX1-LABEL: zextload_v8i8_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; LMULMAX1-NEXT: vle8.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v8, v9 +; LMULMAX1-NEXT: vle8.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, 
ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v10 +; LMULMAX1-NEXT: vzext.vf4 v9, v8 +; LMULMAX1-NEXT: vzext.vf4 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i8_v8i32: @@ -269,13 +265,13 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf8 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v9, v12 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf8 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf8 v9, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i8_v8i64: @@ -301,13 +297,13 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf8 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v9, v12 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf8 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf8 v9, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i8_v8i64: @@ -325,13 +321,12 @@ ; LMULMAX1-LABEL: sextload_v16i8_v16i16: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v8, v9 +; LMULMAX1-NEXT: vle8.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v10 +; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i8_v16i16: @@ -349,13 +344,12 @@ ; LMULMAX1-LABEL: zextload_v16i8_v16i16: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vle8.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v8, v9 +; LMULMAX1-NEXT: vle8.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v10 +; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i8_v16i16: @@ -381,13 +375,13 @@ ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v12 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 4, 
e8, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v9, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i8_v16i32: @@ -413,13 +407,13 @@ ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v12 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v9, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i8_v16i32: @@ -448,38 +442,37 @@ ; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf8 v9, v13 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v10, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf8 v13, v14 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v14, v16 +; LMULMAX1-NEXT: vsext.vf8 v14, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2 +; LMULMAX1-NEXT: vslidedown.vi v11, v11, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf8 v15, v11 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v11, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v11, v15 +; LMULMAX1-NEXT: vsext.vf8 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf8 v15, v16 +; LMULMAX1-NEXT: vsext.vf8 v11, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i8_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX4-NEXT: vle8.v v12, (a0) -; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf8 v8, v12 +; LMULMAX4-NEXT: vle8.v v16, (a0) ; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 +; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf8 v12, v16 +; LMULMAX4-NEXT: vsext.vf8 v12, v8 +; LMULMAX4-NEXT: vsext.vf8 v8, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i8>, ptr %x %z = sext <16 x i8> %y to <16 x i64> @@ -501,38 +494,37 @@ ; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf8 v9, v13 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v10, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2 ; LMULMAX1-NEXT: 
vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf8 v13, v14 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4 +; LMULMAX1-NEXT: vslidedown.vi v11, v11, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v14, v16 +; LMULMAX1-NEXT: vzext.vf8 v14, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2 +; LMULMAX1-NEXT: vslidedown.vi v11, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v11, v15 +; LMULMAX1-NEXT: vzext.vf8 v15, v11 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v11, v10, 4 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf8 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf8 v15, v16 +; LMULMAX1-NEXT: vzext.vf8 v11, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i8_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX4-NEXT: vle8.v v12, (a0) -; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf8 v8, v12 +; LMULMAX4-NEXT: vle8.v v16, (a0) ; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 +; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf8 v12, v16 +; LMULMAX4-NEXT: vzext.vf8 v12, v8 +; LMULMAX4-NEXT: vzext.vf8 v8, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i8>, ptr %x %z = zext <16 x i8> %y to <16 x i64> @@ -660,13 +652,12 @@ ; LMULMAX1-LABEL: sextload_v4i16_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; LMULMAX1-NEXT: vle16.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v8, v9 +; LMULMAX1-NEXT: vle16.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v10 +; LMULMAX1-NEXT: vsext.vf4 v9, v8 +; LMULMAX1-NEXT: vsext.vf4 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v4i16_v4i64: @@ -684,13 +675,12 @@ ; LMULMAX1-LABEL: zextload_v4i16_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; LMULMAX1-NEXT: vle16.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v8, v9 +; LMULMAX1-NEXT: vle16.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v10 +; LMULMAX1-NEXT: vzext.vf4 v9, v8 +; LMULMAX1-NEXT: vzext.vf4 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v4i16_v4i64: @@ -720,13 +710,12 @@ ; LMULMAX1-LABEL: sextload_v8i16_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v8, v9 +; LMULMAX1-NEXT: vle16.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v10 +; 
LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i16_v8i32: @@ -744,13 +733,12 @@ ; LMULMAX1-LABEL: zextload_v8i16_v8i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v8, v9 +; LMULMAX1-NEXT: vle16.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v10 +; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i16_v8i32: @@ -776,13 +764,13 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v12 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v9, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i16_v8i64: @@ -808,13 +796,13 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v10, v11 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v12 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v9, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i16_v8i64: @@ -854,20 +842,19 @@ ; LMULMAX1-LABEL: sextload_v16i16_v16i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vle16.v v10, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle16.v v11, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v8, v9 +; LMULMAX1-NEXT: vle16.v v12, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v10 -; LMULMAX1-NEXT: vsext.vf2 v10, v11 +; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v11, v12 +; LMULMAX1-NEXT: vsext.vf2 v11, v10 +; LMULMAX1-NEXT: vsext.vf2 v10, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i16_v16i32: @@ -885,20 +872,19 @@ ; LMULMAX1-LABEL: zextload_v16i16_v16i32: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vle16.v v9, (a0) +; LMULMAX1-NEXT: vle16.v v10, (a0) ; LMULMAX1-NEXT: addi a0, 
a0, 16 -; LMULMAX1-NEXT: vle16.v v11, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v8, v9 +; LMULMAX1-NEXT: vle16.v v12, (a0) ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v10 -; LMULMAX1-NEXT: vzext.vf2 v10, v11 +; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v12, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v11, v12 +; LMULMAX1-NEXT: vzext.vf2 v11, v10 +; LMULMAX1-NEXT: vzext.vf2 v10, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i16_v16i32: @@ -925,39 +911,38 @@ ; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v10, v11 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v9, v12 ; LMULMAX1-NEXT: vsext.vf4 v12, v13 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v14, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v9, v16 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v11, v16 +; LMULMAX1-NEXT: vsext.vf4 v15, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v13, v16 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i16_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX4-NEXT: vle16.v v12, (a0) -; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf4 v8, v12 +; LMULMAX4-NEXT: vle16.v v16, (a0) ; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 +; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf4 v12, v16 +; LMULMAX4-NEXT: vsext.vf4 v12, v8 +; LMULMAX4-NEXT: vsext.vf4 v8, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i16>, ptr %x %z = sext <16 x i16> %y to <16 x i64> @@ -977,39 +962,38 @@ ; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v10, v11 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v11, v12 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2 +; LMULMAX1-NEXT: 
vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vzext.vf4 v9, v12 ; LMULMAX1-NEXT: vzext.vf4 v12, v13 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v14, v15 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v9, v16 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v11, v16 +; LMULMAX1-NEXT: vzext.vf4 v15, v16 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-NEXT: vzext.vf4 v13, v16 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf4 v15, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i16_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; LMULMAX4-NEXT: vle16.v v12, (a0) -; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf4 v8, v12 +; LMULMAX4-NEXT: vle16.v v16, (a0) ; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 +; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf4 v12, v16 +; LMULMAX4-NEXT: vzext.vf4 v12, v8 +; LMULMAX4-NEXT: vzext.vf4 v8, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i16>, ptr %x %z = zext <16 x i16> %y to <16 x i64> @@ -1096,13 +1080,12 @@ ; LMULMAX1-LABEL: sextload_v4i32_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v8, v9 +; LMULMAX1-NEXT: vle32.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v10 +; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v4i32_v4i64: @@ -1120,13 +1103,12 @@ ; LMULMAX1-LABEL: zextload_v4i32_v4i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v8, v9 +; LMULMAX1-NEXT: vle32.v v10, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v10 +; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v8, v10 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v4i32_v4i64: @@ -1195,20 +1177,19 @@ ; LMULMAX1-LABEL: sextload_v8i32_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vle32.v v10, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v11, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v8, v9 +; LMULMAX1-NEXT: vle32.v v12, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; 
LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v10 -; LMULMAX1-NEXT: vsext.vf2 v10, v11 +; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v11, v12 +; LMULMAX1-NEXT: vsext.vf2 v11, v10 +; LMULMAX1-NEXT: vsext.vf2 v10, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v8i32_v8i64: @@ -1226,20 +1207,19 @@ ; LMULMAX1-LABEL: zextload_v8i32_v8i64: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vle32.v v10, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v11, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v8, v9 +; LMULMAX1-NEXT: vle32.v v12, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v10 -; LMULMAX1-NEXT: vzext.vf2 v10, v11 +; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v11, v12 +; LMULMAX1-NEXT: vzext.vf2 v11, v10 +; LMULMAX1-NEXT: vzext.vf2 v10, v12 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v8i32_v8i64: @@ -1328,45 +1308,43 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 48 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v15, (a1) +; LMULMAX1-NEXT: vle32.v v16, (a1) ; LMULMAX1-NEXT: addi a1, a0, 32 -; LMULMAX1-NEXT: vle32.v v13, (a1) -; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vle32.v v14, (a1) +; LMULMAX1-NEXT: vle32.v v10, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v11, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v8, v9 +; LMULMAX1-NEXT: vle32.v v12, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v9, v10 -; LMULMAX1-NEXT: vsext.vf2 v10, v11 +; LMULMAX1-NEXT: vsext.vf2 v9, v8 +; LMULMAX1-NEXT: vsext.vf2 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v11, v12 -; LMULMAX1-NEXT: vsext.vf2 v12, v13 +; LMULMAX1-NEXT: vsext.vf2 v11, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v14, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v13, v14 -; LMULMAX1-NEXT: vsext.vf2 v14, v15 +; LMULMAX1-NEXT: vsext.vf2 v13, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v15, v16 +; LMULMAX1-NEXT: vsext.vf2 v15, v10 +; LMULMAX1-NEXT: vsext.vf2 v10, v12 +; 
LMULMAX1-NEXT: vsext.vf2 v12, v14 +; LMULMAX1-NEXT: vsext.vf2 v14, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: sextload_v16i32_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; LMULMAX4-NEXT: vle32.v v12, (a0) -; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf2 v8, v12 +; LMULMAX4-NEXT: vle32.v v16, (a0) ; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 +; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vsext.vf2 v12, v16 +; LMULMAX4-NEXT: vsext.vf2 v12, v8 +; LMULMAX4-NEXT: vsext.vf2 v8, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i32>, ptr %x %z = sext <16 x i32> %y to <16 x i64> @@ -1378,45 +1356,43 @@ ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 48 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v15, (a1) +; LMULMAX1-NEXT: vle32.v v16, (a1) ; LMULMAX1-NEXT: addi a1, a0, 32 -; LMULMAX1-NEXT: vle32.v v13, (a1) -; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vle32.v v14, (a1) +; LMULMAX1-NEXT: vle32.v v10, (a0) ; LMULMAX1-NEXT: addi a0, a0, 16 -; LMULMAX1-NEXT: vle32.v v11, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v8, v9 +; LMULMAX1-NEXT: vle32.v v12, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v9, v10 -; LMULMAX1-NEXT: vzext.vf2 v10, v11 +; LMULMAX1-NEXT: vzext.vf2 v9, v8 +; LMULMAX1-NEXT: vzext.vf2 v8, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v11, v12 -; LMULMAX1-NEXT: vzext.vf2 v12, v13 +; LMULMAX1-NEXT: vzext.vf2 v11, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v14, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v13, v14 -; LMULMAX1-NEXT: vzext.vf2 v14, v15 +; LMULMAX1-NEXT: vzext.vf2 v13, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2 +; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v15, v16 +; LMULMAX1-NEXT: vzext.vf2 v15, v10 +; LMULMAX1-NEXT: vzext.vf2 v10, v12 +; LMULMAX1-NEXT: vzext.vf2 v12, v14 +; LMULMAX1-NEXT: vzext.vf2 v14, v16 ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: zextload_v16i32_v16i64: ; LMULMAX4: # %bb.0: ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; LMULMAX4-NEXT: vle32.v v12, (a0) -; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf2 v8, v12 +; LMULMAX4-NEXT: vle32.v v16, (a0) ; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8 +; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8 ; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vzext.vf2 v12, v16 +; LMULMAX4-NEXT: vzext.vf2 v12, v8 +; LMULMAX4-NEXT: vzext.vf2 v8, v16 ; LMULMAX4-NEXT: ret %y = load <16 x i32>, ptr %x %z = zext <16 x i32> %y to <16 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -84,27 +84,27 @@ ; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v9 ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; LMULMAX1-NEXT: vfwcvt.f.f.v v9, v10 +; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8 +; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; LMULMAX1-NEXT: vfwcvt.f.f.v v11, v10 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v11 -; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v11, v12 -; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v8 +; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8 ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v8, v12 -; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v10 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8 ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v12 -; LMULMAX1-NEXT: addi a0, a1, 32 -; LMULMAX1-NEXT: vse64.v v10, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) +; LMULMAX1-NEXT: vfwcvt.f.f.v v8, v10 ; LMULMAX1-NEXT: addi a0, a1, 48 -; LMULMAX1-NEXT: vse64.v v11, (a0) +; LMULMAX1-NEXT: vse64.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a1, 32 +; LMULMAX1-NEXT: vse64.v v12, (a0) +; LMULMAX1-NEXT: vse64.v v11, (a1) ; LMULMAX1-NEXT: addi a1, a1, 16 ; LMULMAX1-NEXT: vse64.v v9, (a1) ; LMULMAX1-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -493,20 +493,20 @@ ; LMULMAX1-NEXT: vle32.v v9, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v8 -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v8, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v8 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v9, v8 ; LMULMAX1-NEXT: addi a0, a1, 16 -; LMULMAX1-NEXT: vse64.v v12, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) +; LMULMAX1-NEXT: vse64.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a1, 48 -; LMULMAX1-NEXT: vse64.v v11, (a0) +; LMULMAX1-NEXT: vse64.v v12, (a0) +; LMULMAX1-NEXT: vse64.v v11, (a1) ; LMULMAX1-NEXT: addi a0, a1, 32 ; LMULMAX1-NEXT: vse64.v v10, (a0) ; LMULMAX1-NEXT: ret @@ -533,20 +533,20 @@ ; LMULMAX1-NEXT: vle32.v v9, (a0) ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v8 +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v9 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: 
vfwcvt.rtz.xu.f.v v11, v8 -; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v8, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v8 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v9, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v9 +; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v9, v8 ; LMULMAX1-NEXT: addi a0, a1, 16 -; LMULMAX1-NEXT: vse64.v v12, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) +; LMULMAX1-NEXT: vse64.v v9, (a0) ; LMULMAX1-NEXT: addi a0, a1, 48 -; LMULMAX1-NEXT: vse64.v v11, (a0) +; LMULMAX1-NEXT: vse64.v v12, (a0) +; LMULMAX1-NEXT: vse64.v v11, (a1) ; LMULMAX1-NEXT: addi a0, a1, 32 ; LMULMAX1-NEXT: vse64.v v10, (a0) ; LMULMAX1-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -449,22 +449,23 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; LMULMAX1-NEXT: vsext.vf2 v10, v9 ; LMULMAX1-NEXT: vfwcvt.f.x.v v9, v10 +; LMULMAX1-NEXT: vsext.vf2 v10, v8 +; LMULMAX1-NEXT: vfwcvt.f.x.v v11, v10 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX1-NEXT: vsext.vf2 v10, v8 +; LMULMAX1-NEXT: vfwcvt.f.x.v v12, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vsext.vf2 v12, v11 -; LMULMAX1-NEXT: vfwcvt.f.x.v v11, v12 -; LMULMAX1-NEXT: vsext.vf2 v12, v8 -; LMULMAX1-NEXT: vfwcvt.f.x.v v8, v12 -; LMULMAX1-NEXT: vsext.vf2 v12, v10 -; LMULMAX1-NEXT: vfwcvt.f.x.v v10, v12 -; LMULMAX1-NEXT: addi a0, a1, 32 -; LMULMAX1-NEXT: vse64.v v10, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) +; LMULMAX1-NEXT: vsext.vf2 v10, v8 +; LMULMAX1-NEXT: vfwcvt.f.x.v v8, v10 ; LMULMAX1-NEXT: addi a0, a1, 48 -; LMULMAX1-NEXT: vse64.v v11, (a0) +; LMULMAX1-NEXT: vse64.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a1, 32 +; LMULMAX1-NEXT: vse64.v v12, (a0) +; LMULMAX1-NEXT: vse64.v v11, (a1) ; LMULMAX1-NEXT: addi a1, a1, 16 ; LMULMAX1-NEXT: vse64.v v9, (a1) ; LMULMAX1-NEXT: ret @@ -493,22 +494,23 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; LMULMAX1-NEXT: vzext.vf2 v10, v9 ; LMULMAX1-NEXT: vfwcvt.f.xu.v v9, v10 +; LMULMAX1-NEXT: vzext.vf2 v10, v8 +; LMULMAX1-NEXT: vfwcvt.f.xu.v v11, v10 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; LMULMAX1-NEXT: vzext.vf2 v10, v8 +; LMULMAX1-NEXT: vfwcvt.f.xu.v v12, v10 ; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vzext.vf2 v12, v11 -; LMULMAX1-NEXT: vfwcvt.f.xu.v v11, v12 -; LMULMAX1-NEXT: vzext.vf2 v12, v8 -; LMULMAX1-NEXT: vfwcvt.f.xu.v v8, v12 -; LMULMAX1-NEXT: vzext.vf2 v12, v10 -; LMULMAX1-NEXT: vfwcvt.f.xu.v v10, v12 -; LMULMAX1-NEXT: addi a0, a1, 32 -; LMULMAX1-NEXT: vse64.v v10, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) +; LMULMAX1-NEXT: vzext.vf2 v10, v8 +; LMULMAX1-NEXT: vfwcvt.f.xu.v v8, v10 ; LMULMAX1-NEXT: addi a0, a1, 48 -; LMULMAX1-NEXT: vse64.v v11, (a0) +; LMULMAX1-NEXT: 
vse64.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a1, 32 +; LMULMAX1-NEXT: vse64.v v12, (a0) +; LMULMAX1-NEXT: vse64.v v11, (a1) ; LMULMAX1-NEXT: addi a1, a1, 16 ; LMULMAX1-NEXT: vse64.v v9, (a1) ; LMULMAX1-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll @@ -90,19 +90,20 @@ ; LMULMAX2-NEXT: vslidedown.vi v10, v8, 8 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; LMULMAX2-NEXT: vsext.vf4 v12, v10 +; LMULMAX2-NEXT: vsext.vf4 v10, v8 ; LMULMAX2-NEXT: vsetivli zero, 16, e8, m2, ta, ma -; LMULMAX2-NEXT: vslidedown.vi v10, v8, 16 +; LMULMAX2-NEXT: vslidedown.vi v8, v8, 16 +; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-NEXT: vsext.vf4 v14, v8 ; LMULMAX2-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX2-NEXT: vslidedown.vi v9, v10, 8 +; LMULMAX2-NEXT: vslidedown.vi v8, v8, 8 ; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; LMULMAX2-NEXT: vsext.vf4 v14, v9 ; LMULMAX2-NEXT: vsext.vf4 v16, v8 -; LMULMAX2-NEXT: vsext.vf4 v8, v10 -; LMULMAX2-NEXT: addi a0, a1, 64 -; LMULMAX2-NEXT: vse32.v v8, (a0) -; LMULMAX2-NEXT: vse32.v v16, (a1) ; LMULMAX2-NEXT: addi a0, a1, 96 +; LMULMAX2-NEXT: vse32.v v16, (a0) +; LMULMAX2-NEXT: addi a0, a1, 64 ; LMULMAX2-NEXT: vse32.v v14, (a0) +; LMULMAX2-NEXT: vse32.v v10, (a1) ; LMULMAX2-NEXT: addi a0, a1, 32 ; LMULMAX2-NEXT: vse32.v v12, (a0) ; LMULMAX2-NEXT: ret @@ -117,39 +118,41 @@ ; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; LMULMAX1-NEXT: vsext.vf4 v11, v10 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v10, v8, 8 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v10, 4 +; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v13, v12 +; LMULMAX1-NEXT: vsext.vf4 v12, v10 +; LMULMAX1-NEXT: vsext.vf4 v10, v8 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 8 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v13, v8 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4 +; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v14, v12 +; LMULMAX1-NEXT: vsext.vf4 v14, v8 +; LMULMAX1-NEXT: vsext.vf4 v8, v9 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v12, v9, 8 +; LMULMAX1-NEXT: vslidedown.vi v9, v9, 8 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vsext.vf4 v15, v9 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; LMULMAX1-NEXT: vslidedown.vi v15, v12, 4 +; LMULMAX1-NEXT: vslidedown.vi v9, v9, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vsext.vf4 v16, v15 -; LMULMAX1-NEXT: vsext.vf4 v15, v8 -; LMULMAX1-NEXT: vsext.vf4 v8, v10 -; LMULMAX1-NEXT: vsext.vf4 v10, v9 -; LMULMAX1-NEXT: vsext.vf4 v9, v12 -; LMULMAX1-NEXT: addi a0, a1, 32 -; LMULMAX1-NEXT: vse32.v v9, (a0) -; LMULMAX1-NEXT: vse32.v v10, (a1) -; LMULMAX1-NEXT: addi a0, a1, 96 -; LMULMAX1-NEXT: vse32.v v8, (a0) -; LMULMAX1-NEXT: addi a0, a1, 64 -; LMULMAX1-NEXT: vse32.v v15, (a0) +; LMULMAX1-NEXT: vsext.vf4 v16, v9 ; LMULMAX1-NEXT: addi a0, a1, 48 ; LMULMAX1-NEXT: vse32.v v16, (a0) -; LMULMAX1-NEXT: addi a0, a1, 16 -; LMULMAX1-NEXT: 
vse32.v v14, (a0) +; LMULMAX1-NEXT: addi a0, a1, 32 +; LMULMAX1-NEXT: vse32.v v15, (a0) +; LMULMAX1-NEXT: vse32.v v8, (a1) ; LMULMAX1-NEXT: addi a0, a1, 112 +; LMULMAX1-NEXT: vse32.v v14, (a0) +; LMULMAX1-NEXT: addi a0, a1, 96 ; LMULMAX1-NEXT: vse32.v v13, (a0) +; LMULMAX1-NEXT: addi a0, a1, 64 +; LMULMAX1-NEXT: vse32.v v10, (a0) +; LMULMAX1-NEXT: addi a0, a1, 16 +; LMULMAX1-NEXT: vse32.v v12, (a0) ; LMULMAX1-NEXT: addi a0, a1, 80 ; LMULMAX1-NEXT: vse32.v v11, (a0) ; LMULMAX1-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -780,19 +780,19 @@ ; CHECK-LABEL: sdiv_v6i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vdiv.vv v10, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v9, 4 -; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: vslidedown.vi v10, v8, 4 +; CHECK-NEXT: vslidedown.vi v11, v9, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vdiv.vv v8, v8, v9 +; CHECK-NEXT: vdiv.vv v10, v11, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vdiv.vv v8, v9, v8 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v8, 4 +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x %b = load <6 x i16>, ptr %y @@ -869,19 +869,19 @@ ; CHECK-LABEL: srem_v6i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vrem.vv v10, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v9, 4 -; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: vslidedown.vi v10, v8, 4 +; CHECK-NEXT: vslidedown.vi v11, v9, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vrem.vv v8, v8, v9 +; CHECK-NEXT: vrem.vv v10, v11, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vrem.vv v8, v9, v8 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v8, 4 +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x %b = load <6 x i16>, ptr %y @@ -958,19 +958,19 @@ ; CHECK-LABEL: udiv_v6i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vdivu.vv v10, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v9, 4 -; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: vslidedown.vi v10, v8, 4 +; CHECK-NEXT: vslidedown.vi v11, v9, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vdivu.vv v8, v8, v9 +; CHECK-NEXT: vdivu.vv v10, v11, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vdivu.vv v8, v9, v8 
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v8, 4 +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x %b = load <6 x i16>, ptr %y @@ -1047,19 +1047,19 @@ ; CHECK-LABEL: urem_v6i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vremu.vv v10, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v9, 4 -; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: vslidedown.vi v10, v8, 4 +; CHECK-NEXT: vslidedown.vi v11, v9, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vremu.vv v8, v8, v9 +; CHECK-NEXT: vremu.vv v10, v11, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vremu.vv v8, v9, v8 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v10, v8, 4 +; CHECK-NEXT: vslideup.vi v8, v10, 4 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x %b = load <6 x i16>, ptr %y @@ -1244,22 +1244,22 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI67_0) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI67_0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vdivu.vv v9, v8, v9 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, v10, 12 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vadd.vi v9, v9, 12 ; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v8, 4 +; CHECK-NEXT: vslidedown.vi v10, v8, 4 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vdivu.vv v9, v10, v9 +; CHECK-NEXT: lui a1, %hi(.LCPI67_0) +; CHECK-NEXT: addi a1, a1, %lo(.LCPI67_0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a1) ; CHECK-NEXT: vdivu.vv v8, v8, v10 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v9, v8, 4 +; CHECK-NEXT: vslideup.vi v8, v9, 4 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vse16.v v9, (a0) +; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x %b = udiv <6 x i16> %a, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -743,69 +743,71 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB12_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; 
RV64ZVE32F-NEXT: .LBB12_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB12_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB12_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB12_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB12_14 -; RV64ZVE32F-NEXT: .LBB12_8: # %else11 +; RV64ZVE32F-NEXT: .LBB12_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_10 -; RV64ZVE32F-NEXT: .LBB12_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_9 +; RV64ZVE32F-NEXT: .LBB12_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB12_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB12_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB12_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB12_16 -; RV64ZVE32F-NEXT: .LBB12_12: # %else20 +; RV64ZVE32F-NEXT: .LBB12_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB12_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_6 ; RV64ZVE32F-NEXT: .LBB12_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB12_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB12_7 ; RV64ZVE32F-NEXT: .LBB12_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB12_9 -; RV64ZVE32F-NEXT: j .LBB12_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB12_8 +; RV64ZVE32F-NEXT: j .LBB12_9 ; RV64ZVE32F-NEXT: .LBB12_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -814,7 +816,7 @@ ; 
RV64ZVE32F-NEXT: vsetivli zero, 7, e8, mf2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB12_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB12_11 ; RV64ZVE32F-NEXT: .LBB12_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -1468,74 +1470,76 @@ ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB23_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB23_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB23_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB23_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_14 -; RV64ZVE32F-NEXT: .LBB23_8: # %else11 +; RV64ZVE32F-NEXT: .LBB23_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_10 -; RV64ZVE32F-NEXT: .LBB23_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_9 +; RV64ZVE32F-NEXT: .LBB23_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB23_10: # %else14 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB23_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB23_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB23_16 -; RV64ZVE32F-NEXT: .LBB23_12: # %else20 +; RV64ZVE32F-NEXT: .LBB23_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB23_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_6 ; RV64ZVE32F-NEXT: .LBB23_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, 
v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB23_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB23_7 ; RV64ZVE32F-NEXT: .LBB23_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB23_9 -; RV64ZVE32F-NEXT: j .LBB23_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB23_8 +; RV64ZVE32F-NEXT: j .LBB23_9 ; RV64ZVE32F-NEXT: .LBB23_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1545,7 +1549,7 @@ ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB23_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB23_11 ; RV64ZVE32F-NEXT: .LBB23_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -1611,74 +1615,76 @@ ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB24_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB24_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB24_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB24_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB24_14 -; RV64ZVE32F-NEXT: .LBB24_8: # %else11 +; RV64ZVE32F-NEXT: .LBB24_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_10 -; RV64ZVE32F-NEXT: .LBB24_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_9 +; RV64ZVE32F-NEXT: .LBB24_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB24_10: # %else14 
+; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB24_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB24_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB24_16 -; RV64ZVE32F-NEXT: .LBB24_12: # %else20 +; RV64ZVE32F-NEXT: .LBB24_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB24_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_6 ; RV64ZVE32F-NEXT: .LBB24_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB24_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB24_7 ; RV64ZVE32F-NEXT: .LBB24_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB24_9 -; RV64ZVE32F-NEXT: j .LBB24_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB24_8 +; RV64ZVE32F-NEXT: j .LBB24_9 ; RV64ZVE32F-NEXT: .LBB24_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1688,7 +1694,7 @@ ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB24_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB24_11 ; RV64ZVE32F-NEXT: .LBB24_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -1757,78 +1763,80 @@ ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB25_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: 
vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB25_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB25_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB25_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB25_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB25_14 -; RV64ZVE32F-NEXT: .LBB25_8: # %else11 +; RV64ZVE32F-NEXT: .LBB25_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_10 -; RV64ZVE32F-NEXT: .LBB25_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_9 +; RV64ZVE32F-NEXT: .LBB25_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB25_10: # %else14 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB25_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB25_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB25_16 -; RV64ZVE32F-NEXT: .LBB25_12: # %else20 +; RV64ZVE32F-NEXT: .LBB25_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB25_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_6 ; RV64ZVE32F-NEXT: .LBB25_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB25_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB25_7 ; RV64ZVE32F-NEXT: .LBB25_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; 
RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB25_9 -; RV64ZVE32F-NEXT: j .LBB25_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB25_8 +; RV64ZVE32F-NEXT: j .LBB25_9 ; RV64ZVE32F-NEXT: .LBB25_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -1839,7 +1847,7 @@ ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB25_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB25_11 ; RV64ZVE32F-NEXT: .LBB25_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -1906,73 +1914,75 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB26_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB26_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB26_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB26_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB26_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB26_14 -; RV64ZVE32F-NEXT: .LBB26_8: # %else11 +; RV64ZVE32F-NEXT: .LBB26_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_10 -; RV64ZVE32F-NEXT: .LBB26_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_9 +; RV64ZVE32F-NEXT: .LBB26_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB26_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB26_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB26_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB26_16 -; RV64ZVE32F-NEXT: .LBB26_12: # %else20 +; RV64ZVE32F-NEXT: .LBB26_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB26_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lh a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli 
zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_6 ; RV64ZVE32F-NEXT: .LBB26_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB26_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB26_7 ; RV64ZVE32F-NEXT: .LBB26_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB26_9 -; RV64ZVE32F-NEXT: j .LBB26_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB26_8 +; RV64ZVE32F-NEXT: j .LBB26_9 ; RV64ZVE32F-NEXT: .LBB26_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1982,7 +1992,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB26_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB26_11 ; RV64ZVE32F-NEXT: .LBB26_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -2528,56 +2538,58 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB35_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB35_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB35_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_14 -; RV64ZVE32F-NEXT: .LBB35_8: # %else11 +; RV64ZVE32F-NEXT: .LBB35_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_10 -; RV64ZVE32F-NEXT: .LBB35_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_9 +; RV64ZVE32F-NEXT: .LBB35_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, 
v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB35_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB35_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB35_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB35_16 -; RV64ZVE32F-NEXT: .LBB35_12: # %else20 +; RV64ZVE32F-NEXT: .LBB35_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB35_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_6 ; RV64ZVE32F-NEXT: .LBB35_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -2586,19 +2598,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB35_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB35_7 ; RV64ZVE32F-NEXT: .LBB35_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB35_9 -; RV64ZVE32F-NEXT: j .LBB35_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB35_8 +; RV64ZVE32F-NEXT: j .LBB35_9 ; RV64ZVE32F-NEXT: .LBB35_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2609,7 +2621,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB35_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB35_11 ; RV64ZVE32F-NEXT: .LBB35_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -2676,56 +2688,58 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB36_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, 
ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB36_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB36_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB36_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB36_14 -; RV64ZVE32F-NEXT: .LBB36_8: # %else11 +; RV64ZVE32F-NEXT: .LBB36_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_10 -; RV64ZVE32F-NEXT: .LBB36_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_9 +; RV64ZVE32F-NEXT: .LBB36_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB36_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB36_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB36_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB36_16 -; RV64ZVE32F-NEXT: .LBB36_12: # %else20 +; RV64ZVE32F-NEXT: .LBB36_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB36_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_6 ; RV64ZVE32F-NEXT: .LBB36_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -2734,19 +2748,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB36_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB36_7 ; RV64ZVE32F-NEXT: .LBB36_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; 
RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB36_9 -; RV64ZVE32F-NEXT: j .LBB36_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB36_8 +; RV64ZVE32F-NEXT: j .LBB36_9 ; RV64ZVE32F-NEXT: .LBB36_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2757,7 +2771,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB36_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB36_11 ; RV64ZVE32F-NEXT: .LBB36_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -2827,81 +2841,83 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB37_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB37_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB37_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB37_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB37_14 -; RV64ZVE32F-NEXT: .LBB37_8: # %else11 +; RV64ZVE32F-NEXT: .LBB37_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_10 -; RV64ZVE32F-NEXT: .LBB37_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_9 +; RV64ZVE32F-NEXT: .LBB37_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB37_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB37_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB37_15 -; 
RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB37_16 -; RV64ZVE32F-NEXT: .LBB37_12: # %else20 +; RV64ZVE32F-NEXT: .LBB37_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB37_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_6 ; RV64ZVE32F-NEXT: .LBB37_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB37_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB37_7 ; RV64ZVE32F-NEXT: .LBB37_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB37_9 -; RV64ZVE32F-NEXT: j .LBB37_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB37_8 +; RV64ZVE32F-NEXT: j .LBB37_9 ; RV64ZVE32F-NEXT: .LBB37_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -2913,7 +2929,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB37_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB37_11 ; RV64ZVE32F-NEXT: .LBB37_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -2983,56 +2999,58 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB38_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB38_6: # 
%else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB38_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB38_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB38_14 -; RV64ZVE32F-NEXT: .LBB38_8: # %else11 +; RV64ZVE32F-NEXT: .LBB38_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_10 -; RV64ZVE32F-NEXT: .LBB38_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_9 +; RV64ZVE32F-NEXT: .LBB38_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB38_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB38_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB38_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB38_16 -; RV64ZVE32F-NEXT: .LBB38_12: # %else20 +; RV64ZVE32F-NEXT: .LBB38_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB38_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_6 ; RV64ZVE32F-NEXT: .LBB38_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -3041,19 +3059,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB38_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB38_7 ; RV64ZVE32F-NEXT: .LBB38_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB38_9 -; RV64ZVE32F-NEXT: j .LBB38_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB38_8 
+; RV64ZVE32F-NEXT: j .LBB38_9 ; RV64ZVE32F-NEXT: .LBB38_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -3064,7 +3082,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB38_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB38_11 ; RV64ZVE32F-NEXT: .LBB38_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -3132,56 +3150,58 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB39_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB39_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB39_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB39_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB39_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB39_14 -; RV64ZVE32F-NEXT: .LBB39_8: # %else11 +; RV64ZVE32F-NEXT: .LBB39_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_10 -; RV64ZVE32F-NEXT: .LBB39_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_9 +; RV64ZVE32F-NEXT: .LBB39_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB39_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB39_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB39_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB39_16 -; RV64ZVE32F-NEXT: .LBB39_12: # %else20 +; RV64ZVE32F-NEXT: .LBB39_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB39_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lw a2, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli 
zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_6 ; RV64ZVE32F-NEXT: .LBB39_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) @@ -3190,19 +3210,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB39_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB39_7 ; RV64ZVE32F-NEXT: .LBB39_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lw a2, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB39_9 -; RV64ZVE32F-NEXT: j .LBB39_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB39_8 +; RV64ZVE32F-NEXT: j .LBB39_9 ; RV64ZVE32F-NEXT: .LBB39_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -3213,7 +3233,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB39_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB39_11 ; RV64ZVE32F-NEXT: .LBB39_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -3286,58 +3306,60 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB40_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: lw a3, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB40_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB40_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB40_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB40_14 -; RV64ZVE32F-NEXT: .LBB40_8: # %else11 +; RV64ZVE32F-NEXT: .LBB40_7: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_10 -; RV64ZVE32F-NEXT: .LBB40_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_9 +; RV64ZVE32F-NEXT: .LBB40_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi 
v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vmv.s.x v8, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB40_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB40_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB40_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 ; RV64ZVE32F-NEXT: bnez a2, .LBB40_16 -; RV64ZVE32F-NEXT: .LBB40_12: # %else20 +; RV64ZVE32F-NEXT: .LBB40_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB40_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: lw a3, 0(a3) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_6 ; RV64ZVE32F-NEXT: .LBB40_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 @@ -3347,20 +3369,20 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB40_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB40_7 ; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: lw a3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a3 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vmv.s.x v8, a3 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB40_9 -; RV64ZVE32F-NEXT: j .LBB40_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB40_8 +; RV64ZVE32F-NEXT: j .LBB40_9 ; RV64ZVE32F-NEXT: .LBB40_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 @@ -3372,7 +3394,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB40_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB40_11 ; RV64ZVE32F-NEXT: .LBB40_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -4361,22 +4383,22 @@ ; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; 
RV64ZVE32F-NEXT: andi a3, a6, 1 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB48_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: bnez a4, .LBB48_4 ; RV64ZVE32F-NEXT: .LBB48_2: ; RV64ZVE32F-NEXT: ld a4, 8(a2) ; RV64ZVE32F-NEXT: j .LBB48_5 ; RV64ZVE32F-NEXT: .LBB48_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: beqz a4, .LBB48_2 ; RV64ZVE32F-NEXT: .LBB48_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -4386,87 +4408,83 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB48_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi a5, a6, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a5, .LBB48_7 +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB48_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a5, v9 -; RV64ZVE32F-NEXT: slli a5, a5, 3 -; RV64ZVE32F-NEXT: add a5, a1, a5 -; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: j .LBB48_8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB48_11 ; RV64ZVE32F-NEXT: .LBB48_7: -; RV64ZVE32F-NEXT: ld a5, 16(a2) -; RV64ZVE32F-NEXT: .LBB48_8: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a7, a6, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz a7, .LBB48_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB48_12 +; RV64ZVE32F-NEXT: .LBB48_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB48_13 +; RV64ZVE32F-NEXT: .LBB48_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB48_14 +; RV64ZVE32F-NEXT: .LBB48_10: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: beqz a7, .LBB48_7 +; RV64ZVE32F-NEXT: .LBB48_11: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB48_13 -; RV64ZVE32F-NEXT: .LBB48_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB48_14 -; RV64ZVE32F-NEXT: .LBB48_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB48_15 -; RV64ZVE32F-NEXT: .LBB48_12: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a6, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB48_10 -; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB48_8 +; RV64ZVE32F-NEXT: 
.LBB48_12: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB48_11 -; RV64ZVE32F-NEXT: .LBB48_14: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB48_9 +; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB48_15: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi t2, a6, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB48_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: .LBB48_14: # %else14 +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB48_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB48_19 -; RV64ZVE32F-NEXT: .LBB48_17: +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB48_18 +; RV64ZVE32F-NEXT: .LBB48_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB48_20 -; RV64ZVE32F-NEXT: .LBB48_18: +; RV64ZVE32F-NEXT: j .LBB48_19 +; RV64ZVE32F-NEXT: .LBB48_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB48_17 -; RV64ZVE32F-NEXT: .LBB48_19: # %cond.load19 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB48_16 +; RV64ZVE32F-NEXT: .LBB48_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB48_20: # %else20 +; RV64ZVE32F-NEXT: .LBB48_19: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) -; RV64ZVE32F-NEXT: sd a5, 16(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) ; RV64ZVE32F-NEXT: sd t1, 40(a0) @@ -4639,22 +4657,22 @@ ; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i64: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a6, v0 -; RV64ZVE32F-NEXT: andi a3, a6, 1 +; RV64ZVE32F-NEXT: vmv.x.s a5, v0 +; RV64ZVE32F-NEXT: andi a3, a5, 1 ; RV64ZVE32F-NEXT: beqz a3, .LBB49_3 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: ld a3, 0(a3) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: bnez a4, .LBB49_4 ; RV64ZVE32F-NEXT: .LBB49_2: ; RV64ZVE32F-NEXT: ld a4, 8(a2) ; RV64ZVE32F-NEXT: j .LBB49_5 ; RV64ZVE32F-NEXT: .LBB49_3: ; RV64ZVE32F-NEXT: ld a3, 0(a2) -; RV64ZVE32F-NEXT: andi a4, a6, 2 +; RV64ZVE32F-NEXT: andi a4, a5, 2 ; RV64ZVE32F-NEXT: beqz a4, .LBB49_2 ; RV64ZVE32F-NEXT: .LBB49_4: # %cond.load1 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma @@ -4664,87 +4682,83 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB49_5: # %else2 +; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi a5, a6, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a5, .LBB49_7 +; RV64ZVE32F-NEXT: andi a6, a5, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB49_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a5, v9 -; RV64ZVE32F-NEXT: slli a5, a5, 3 -; RV64ZVE32F-NEXT: add a5, a1, a5 -; RV64ZVE32F-NEXT: ld a5, 0(a5) -; RV64ZVE32F-NEXT: j .LBB49_8 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 +; RV64ZVE32F-NEXT: slli a6, a6, 3 +; RV64ZVE32F-NEXT: add a6, a1, a6 +; RV64ZVE32F-NEXT: ld a6, 0(a6) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB49_11 ; RV64ZVE32F-NEXT: .LBB49_7: -; RV64ZVE32F-NEXT: ld a5, 16(a2) -; RV64ZVE32F-NEXT: .LBB49_8: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a7, a6, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz a7, .LBB49_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB49_12 +; RV64ZVE32F-NEXT: .LBB49_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB49_13 +; RV64ZVE32F-NEXT: .LBB49_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB49_14 +; RV64ZVE32F-NEXT: .LBB49_10: +; RV64ZVE32F-NEXT: ld a6, 16(a2) +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: beqz a7, .LBB49_7 +; RV64ZVE32F-NEXT: .LBB49_11: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: andi t0, a6, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB49_13 -; RV64ZVE32F-NEXT: .LBB49_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB49_14 -; RV64ZVE32F-NEXT: .LBB49_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB49_15 -; RV64ZVE32F-NEXT: .LBB49_12: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a6, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB49_10 -; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: beqz t0, .LBB49_8 +; RV64ZVE32F-NEXT: .LBB49_12: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) -; RV64ZVE32F-NEXT: andi t1, a6, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB49_11 -; RV64ZVE32F-NEXT: .LBB49_14: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: beqz t1, .LBB49_9 +; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB49_15: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: andi t2, a6, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: 
beqz t2, .LBB49_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: .LBB49_14: # %else14 +; RV64ZVE32F-NEXT: andi t2, a5, 64 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB49_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB49_19 -; RV64ZVE32F-NEXT: .LBB49_17: +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: bnez a5, .LBB49_18 +; RV64ZVE32F-NEXT: .LBB49_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB49_20 -; RV64ZVE32F-NEXT: .LBB49_18: +; RV64ZVE32F-NEXT: j .LBB49_19 +; RV64ZVE32F-NEXT: .LBB49_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) -; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB49_17 -; RV64ZVE32F-NEXT: .LBB49_19: # %cond.load19 +; RV64ZVE32F-NEXT: andi a5, a5, -128 +; RV64ZVE32F-NEXT: beqz a5, .LBB49_16 +; RV64ZVE32F-NEXT: .LBB49_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB49_20: # %else20 +; RV64ZVE32F-NEXT: .LBB49_19: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) -; RV64ZVE32F-NEXT: sd a5, 16(a0) +; RV64ZVE32F-NEXT: sd a6, 16(a0) ; RV64ZVE32F-NEXT: sd a7, 24(a0) ; RV64ZVE32F-NEXT: sd t0, 32(a0) ; RV64ZVE32F-NEXT: sd t1, 40(a0) @@ -4945,90 +4959,86 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB50_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB50_7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB50_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 ; RV64ZVE32F-NEXT: andi a6, a6, 255 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB50_8 +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB50_11 ; RV64ZVE32F-NEXT: .LBB50_7: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB50_12 +; RV64ZVE32F-NEXT: .LBB50_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB50_13 +; RV64ZVE32F-NEXT: .LBB50_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB50_14 +; RV64ZVE32F-NEXT: .LBB50_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB50_8: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz a7, .LBB50_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: beqz a7, .LBB50_7 +; RV64ZVE32F-NEXT: .LBB50_11: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: andi a7, a7, 255 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: 
bnez t0, .LBB50_13 -; RV64ZVE32F-NEXT: .LBB50_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB50_14 -; RV64ZVE32F-NEXT: .LBB50_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB50_15 -; RV64ZVE32F-NEXT: .LBB50_12: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB50_10 -; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: beqz t0, .LBB50_8 +; RV64ZVE32F-NEXT: .LBB50_12: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: andi t0, t0, 255 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB50_11 -; RV64ZVE32F-NEXT: .LBB50_14: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: beqz t1, .LBB50_9 +; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 ; RV64ZVE32F-NEXT: andi t1, t1, 255 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB50_15: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB50_14: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB50_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB50_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: andi t2, t2, 255 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB50_19 -; RV64ZVE32F-NEXT: .LBB50_17: +; RV64ZVE32F-NEXT: bnez a5, .LBB50_18 +; RV64ZVE32F-NEXT: .LBB50_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB50_20 -; RV64ZVE32F-NEXT: .LBB50_18: +; RV64ZVE32F-NEXT: j .LBB50_19 +; RV64ZVE32F-NEXT: .LBB50_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB50_17 -; RV64ZVE32F-NEXT: .LBB50_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB50_16 +; RV64ZVE32F-NEXT: .LBB50_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB50_20: # %else20 +; RV64ZVE32F-NEXT: .LBB50_19: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5231,84 +5241,80 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB51_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB51_7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB51_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, 
a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB51_8 +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB51_11 ; RV64ZVE32F-NEXT: .LBB51_7: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB51_12 +; RV64ZVE32F-NEXT: .LBB51_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB51_13 +; RV64ZVE32F-NEXT: .LBB51_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB51_14 +; RV64ZVE32F-NEXT: .LBB51_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB51_8: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz a7, .LBB51_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: beqz a7, .LBB51_7 +; RV64ZVE32F-NEXT: .LBB51_11: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB51_13 -; RV64ZVE32F-NEXT: .LBB51_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB51_14 -; RV64ZVE32F-NEXT: .LBB51_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB51_15 -; RV64ZVE32F-NEXT: .LBB51_12: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB51_10 -; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: beqz t0, .LBB51_8 +; RV64ZVE32F-NEXT: .LBB51_12: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB51_11 -; RV64ZVE32F-NEXT: .LBB51_14: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: beqz t1, .LBB51_9 +; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB51_15: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB51_14: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB51_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB51_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB51_19 -; RV64ZVE32F-NEXT: .LBB51_17: +; RV64ZVE32F-NEXT: bnez a5, .LBB51_18 +; RV64ZVE32F-NEXT: .LBB51_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB51_20 -; RV64ZVE32F-NEXT: .LBB51_18: +; RV64ZVE32F-NEXT: j .LBB51_19 +; RV64ZVE32F-NEXT: .LBB51_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, 
.LBB51_17 -; RV64ZVE32F-NEXT: .LBB51_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB51_16 +; RV64ZVE32F-NEXT: .LBB51_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB51_20: # %else20 +; RV64ZVE32F-NEXT: .LBB51_19: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5510,84 +5516,80 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB52_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a6, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a6, .LBB52_7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a6, .LBB52_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a6, v9 +; RV64ZVE32F-NEXT: vmv.x.s a6, v8 ; RV64ZVE32F-NEXT: slli a6, a6, 3 ; RV64ZVE32F-NEXT: add a6, a1, a6 ; RV64ZVE32F-NEXT: ld a6, 0(a6) -; RV64ZVE32F-NEXT: j .LBB52_8 +; RV64ZVE32F-NEXT: andi a7, a5, 8 +; RV64ZVE32F-NEXT: bnez a7, .LBB52_11 ; RV64ZVE32F-NEXT: .LBB52_7: +; RV64ZVE32F-NEXT: ld a7, 24(a2) +; RV64ZVE32F-NEXT: andi t0, a5, 16 +; RV64ZVE32F-NEXT: bnez t0, .LBB52_12 +; RV64ZVE32F-NEXT: .LBB52_8: +; RV64ZVE32F-NEXT: ld t0, 32(a2) +; RV64ZVE32F-NEXT: andi t1, a5, 32 +; RV64ZVE32F-NEXT: bnez t1, .LBB52_13 +; RV64ZVE32F-NEXT: .LBB52_9: +; RV64ZVE32F-NEXT: ld t1, 40(a2) +; RV64ZVE32F-NEXT: j .LBB52_14 +; RV64ZVE32F-NEXT: .LBB52_10: ; RV64ZVE32F-NEXT: ld a6, 16(a2) -; RV64ZVE32F-NEXT: .LBB52_8: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi a7, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz a7, .LBB52_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: beqz a7, .LBB52_7 +; RV64ZVE32F-NEXT: .LBB52_11: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) ; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: bnez t0, .LBB52_13 -; RV64ZVE32F-NEXT: .LBB52_10: -; RV64ZVE32F-NEXT: ld t0, 32(a2) -; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: bnez t1, .LBB52_14 -; RV64ZVE32F-NEXT: .LBB52_11: -; RV64ZVE32F-NEXT: ld t1, 40(a2) -; RV64ZVE32F-NEXT: j .LBB52_15 -; RV64ZVE32F-NEXT: .LBB52_12: -; RV64ZVE32F-NEXT: ld a7, 24(a2) -; RV64ZVE32F-NEXT: andi t0, a5, 16 -; RV64ZVE32F-NEXT: beqz t0, .LBB52_10 -; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s t0, v8 +; RV64ZVE32F-NEXT: beqz t0, .LBB52_8 +; RV64ZVE32F-NEXT: .LBB52_12: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t0, v9 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a5, 32 -; RV64ZVE32F-NEXT: beqz t1, .LBB52_11 -; RV64ZVE32F-NEXT: .LBB52_14: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t1, v9 +; RV64ZVE32F-NEXT: beqz t1, .LBB52_9 +; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load13 +; RV64ZVE32F-NEXT: 
vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t1, v8 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) -; RV64ZVE32F-NEXT: .LBB52_15: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB52_14: # %else14 ; RV64ZVE32F-NEXT: andi t2, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t2, .LBB52_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t2, .LBB52_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: bnez a5, .LBB52_19 -; RV64ZVE32F-NEXT: .LBB52_17: +; RV64ZVE32F-NEXT: bnez a5, .LBB52_18 +; RV64ZVE32F-NEXT: .LBB52_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB52_20 -; RV64ZVE32F-NEXT: .LBB52_18: +; RV64ZVE32F-NEXT: j .LBB52_19 +; RV64ZVE32F-NEXT: .LBB52_17: ; RV64ZVE32F-NEXT: ld t2, 48(a2) ; RV64ZVE32F-NEXT: andi a5, a5, -128 -; RV64ZVE32F-NEXT: beqz a5, .LBB52_17 -; RV64ZVE32F-NEXT: .LBB52_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a5, .LBB52_16 +; RV64ZVE32F-NEXT: .LBB52_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB52_20: # %else20 +; RV64ZVE32F-NEXT: .LBB52_19: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a6, 16(a0) @@ -5794,90 +5796,86 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: ld a4, 0(a4) ; RV64ZVE32F-NEXT: .LBB53_5: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a7, a6, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a7, .LBB53_7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: beqz a7, .LBB53_10 ; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a7, v9 +; RV64ZVE32F-NEXT: vmv.x.s a7, v8 ; RV64ZVE32F-NEXT: and a7, a7, a5 ; RV64ZVE32F-NEXT: slli a7, a7, 3 ; RV64ZVE32F-NEXT: add a7, a1, a7 ; RV64ZVE32F-NEXT: ld a7, 0(a7) -; RV64ZVE32F-NEXT: j .LBB53_8 +; RV64ZVE32F-NEXT: andi t0, a6, 8 +; RV64ZVE32F-NEXT: bnez t0, .LBB53_11 ; RV64ZVE32F-NEXT: .LBB53_7: +; RV64ZVE32F-NEXT: ld t0, 24(a2) +; RV64ZVE32F-NEXT: andi t1, a6, 16 +; RV64ZVE32F-NEXT: bnez t1, .LBB53_12 +; RV64ZVE32F-NEXT: .LBB53_8: +; RV64ZVE32F-NEXT: ld t1, 32(a2) +; RV64ZVE32F-NEXT: andi t2, a6, 32 +; RV64ZVE32F-NEXT: bnez t2, .LBB53_13 +; RV64ZVE32F-NEXT: .LBB53_9: +; RV64ZVE32F-NEXT: ld t2, 40(a2) +; RV64ZVE32F-NEXT: j .LBB53_14 +; RV64ZVE32F-NEXT: .LBB53_10: ; RV64ZVE32F-NEXT: ld a7, 16(a2) -; RV64ZVE32F-NEXT: .LBB53_8: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; RV64ZVE32F-NEXT: andi t0, a6, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 -; RV64ZVE32F-NEXT: beqz t0, .LBB53_12 -; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s t0, v9 +; RV64ZVE32F-NEXT: beqz t0, .LBB53_7 +; RV64ZVE32F-NEXT: .LBB53_11: # %cond.load7 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s t0, v8 ; RV64ZVE32F-NEXT: and t0, t0, a5 ; RV64ZVE32F-NEXT: slli t0, t0, 3 ; 
RV64ZVE32F-NEXT: add t0, a1, t0 ; RV64ZVE32F-NEXT: ld t0, 0(t0) ; RV64ZVE32F-NEXT: andi t1, a6, 16 -; RV64ZVE32F-NEXT: bnez t1, .LBB53_13 -; RV64ZVE32F-NEXT: .LBB53_10: -; RV64ZVE32F-NEXT: ld t1, 32(a2) -; RV64ZVE32F-NEXT: andi t2, a6, 32 -; RV64ZVE32F-NEXT: bnez t2, .LBB53_14 -; RV64ZVE32F-NEXT: .LBB53_11: -; RV64ZVE32F-NEXT: ld t2, 40(a2) -; RV64ZVE32F-NEXT: j .LBB53_15 -; RV64ZVE32F-NEXT: .LBB53_12: -; RV64ZVE32F-NEXT: ld t0, 24(a2) -; RV64ZVE32F-NEXT: andi t1, a6, 16 -; RV64ZVE32F-NEXT: beqz t1, .LBB53_10 -; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s t1, v8 +; RV64ZVE32F-NEXT: beqz t1, .LBB53_8 +; RV64ZVE32F-NEXT: .LBB53_12: # %cond.load10 +; RV64ZVE32F-NEXT: vmv.x.s t1, v9 ; RV64ZVE32F-NEXT: and t1, t1, a5 ; RV64ZVE32F-NEXT: slli t1, t1, 3 ; RV64ZVE32F-NEXT: add t1, a1, t1 ; RV64ZVE32F-NEXT: ld t1, 0(t1) ; RV64ZVE32F-NEXT: andi t2, a6, 32 -; RV64ZVE32F-NEXT: beqz t2, .LBB53_11 -; RV64ZVE32F-NEXT: .LBB53_14: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s t2, v9 +; RV64ZVE32F-NEXT: beqz t2, .LBB53_9 +; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s t2, v8 ; RV64ZVE32F-NEXT: and t2, t2, a5 ; RV64ZVE32F-NEXT: slli t2, t2, 3 ; RV64ZVE32F-NEXT: add t2, a1, t2 ; RV64ZVE32F-NEXT: ld t2, 0(t2) -; RV64ZVE32F-NEXT: .LBB53_15: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB53_14: # %else14 ; RV64ZVE32F-NEXT: andi t3, a6, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz t3, .LBB53_18 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz t3, .LBB53_17 +; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s t3, v8 ; RV64ZVE32F-NEXT: and t3, t3, a5 ; RV64ZVE32F-NEXT: slli t3, t3, 3 ; RV64ZVE32F-NEXT: add t3, a1, t3 ; RV64ZVE32F-NEXT: ld t3, 0(t3) ; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: bnez a6, .LBB53_19 -; RV64ZVE32F-NEXT: .LBB53_17: +; RV64ZVE32F-NEXT: bnez a6, .LBB53_18 +; RV64ZVE32F-NEXT: .LBB53_16: ; RV64ZVE32F-NEXT: ld a1, 56(a2) -; RV64ZVE32F-NEXT: j .LBB53_20 -; RV64ZVE32F-NEXT: .LBB53_18: +; RV64ZVE32F-NEXT: j .LBB53_19 +; RV64ZVE32F-NEXT: .LBB53_17: ; RV64ZVE32F-NEXT: ld t3, 48(a2) ; RV64ZVE32F-NEXT: andi a6, a6, -128 -; RV64ZVE32F-NEXT: beqz a6, .LBB53_17 -; RV64ZVE32F-NEXT: .LBB53_19: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a6, .LBB53_16 +; RV64ZVE32F-NEXT: .LBB53_18: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: and a2, a2, a5 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: ld a1, 0(a1) -; RV64ZVE32F-NEXT: .LBB53_20: # %else20 +; RV64ZVE32F-NEXT: .LBB53_19: # %else20 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: sd a4, 8(a0) ; RV64ZVE32F-NEXT: sd a7, 16(a0) @@ -7403,74 +7401,76 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB64_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; 
RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB64_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB64_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB64_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB64_14 -; RV64ZVE32F-NEXT: .LBB64_8: # %else11 +; RV64ZVE32F-NEXT: .LBB64_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_10 -; RV64ZVE32F-NEXT: .LBB64_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_9 +; RV64ZVE32F-NEXT: .LBB64_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB64_10: # %else14 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB64_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB64_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB64_16 -; RV64ZVE32F-NEXT: .LBB64_12: # %else20 +; RV64ZVE32F-NEXT: .LBB64_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB64_13: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: .LBB64_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB64_8 -; RV64ZVE32F-NEXT: .LBB64_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_6 +; RV64ZVE32F-NEXT: .LBB64_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB64_7 +; RV64ZVE32F-NEXT: .LBB64_14: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli 
a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB64_9 -; RV64ZVE32F-NEXT: j .LBB64_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB64_8 +; RV64ZVE32F-NEXT: j .LBB64_9 ; RV64ZVE32F-NEXT: .LBB64_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -7480,7 +7480,7 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB64_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB64_11 ; RV64ZVE32F-NEXT: .LBB64_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -7546,74 +7546,76 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB65_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB65_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB65_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB65_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB65_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB65_14 -; RV64ZVE32F-NEXT: .LBB65_8: # %else11 +; RV64ZVE32F-NEXT: .LBB65_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_10 -; RV64ZVE32F-NEXT: .LBB65_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_9 +; RV64ZVE32F-NEXT: .LBB65_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB65_10: # %else14 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB65_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB65_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB65_16 -; RV64ZVE32F-NEXT: .LBB65_12: # %else20 +; RV64ZVE32F-NEXT: .LBB65_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; 
RV64ZVE32F-NEXT: .LBB65_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_6 ; RV64ZVE32F-NEXT: .LBB65_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB65_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB65_7 ; RV64ZVE32F-NEXT: .LBB65_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB65_9 -; RV64ZVE32F-NEXT: j .LBB65_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB65_8 +; RV64ZVE32F-NEXT: j .LBB65_9 ; RV64ZVE32F-NEXT: .LBB65_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -7623,7 +7625,7 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB65_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB65_11 ; RV64ZVE32F-NEXT: .LBB65_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -7692,78 +7694,80 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB66_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB66_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB66_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB66_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB66_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB66_14 -; RV64ZVE32F-NEXT: .LBB66_8: # %else11 +; RV64ZVE32F-NEXT: .LBB66_7: # %else11 ; RV64ZVE32F-NEXT: andi 
a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_10 -; RV64ZVE32F-NEXT: .LBB66_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_9 +; RV64ZVE32F-NEXT: .LBB66_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB66_10: # %else14 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB66_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB66_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB66_16 -; RV64ZVE32F-NEXT: .LBB66_12: # %else20 +; RV64ZVE32F-NEXT: .LBB66_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB66_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_6 ; RV64ZVE32F-NEXT: .LBB66_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB66_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB66_7 ; RV64ZVE32F-NEXT: .LBB66_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB66_9 -; RV64ZVE32F-NEXT: j .LBB66_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB66_8 +; RV64ZVE32F-NEXT: j .LBB66_9 ; RV64ZVE32F-NEXT: .LBB66_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -7774,7 +7778,7 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB66_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB66_11 ; RV64ZVE32F-NEXT: .LBB66_16: # %cond.load19 ; 
RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -7841,73 +7845,75 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB67_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 -; RV64ZVE32F-NEXT: .LBB67_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB67_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB67_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB67_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB67_14 -; RV64ZVE32F-NEXT: .LBB67_8: # %else11 +; RV64ZVE32F-NEXT: .LBB67_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_10 -; RV64ZVE32F-NEXT: .LBB67_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_9 +; RV64ZVE32F-NEXT: .LBB67_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5 -; RV64ZVE32F-NEXT: .LBB67_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5 +; RV64ZVE32F-NEXT: .LBB67_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB67_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB67_16 -; RV64ZVE32F-NEXT: .LBB67_12: # %else20 +; RV64ZVE32F-NEXT: .LBB67_11: # %else20 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB67_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flh fa5, 0(a2) +; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_6 ; RV64ZVE32F-NEXT: .LBB67_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB67_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB67_7 ; RV64ZVE32F-NEXT: .LBB67_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flh fa5, 0(a2) -; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB67_9 -; RV64ZVE32F-NEXT: j .LBB67_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB67_8 +; RV64ZVE32F-NEXT: j .LBB67_9 ; RV64ZVE32F-NEXT: .LBB67_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -7917,7 +7923,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB67_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB67_11 ; RV64ZVE32F-NEXT: .LBB67_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -8337,56 +8343,58 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB74_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB74_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB74_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB74_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB74_14 -; RV64ZVE32F-NEXT: .LBB74_8: # %else11 +; RV64ZVE32F-NEXT: .LBB74_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_10 -; RV64ZVE32F-NEXT: .LBB74_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_9 +; RV64ZVE32F-NEXT: .LBB74_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB74_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB74_9: 
# %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB74_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB74_16 -; RV64ZVE32F-NEXT: .LBB74_12: # %else20 +; RV64ZVE32F-NEXT: .LBB74_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB74_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_6 ; RV64ZVE32F-NEXT: .LBB74_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) @@ -8395,19 +8403,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB74_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB74_7 ; RV64ZVE32F-NEXT: .LBB74_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB74_9 -; RV64ZVE32F-NEXT: j .LBB74_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB74_8 +; RV64ZVE32F-NEXT: j .LBB74_9 ; RV64ZVE32F-NEXT: .LBB74_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -8418,7 +8426,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB74_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB74_11 ; RV64ZVE32F-NEXT: .LBB74_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -8485,56 +8493,58 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB75_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB75_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB75_6: # %else5 -; 
RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB75_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB75_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB75_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB75_14 -; RV64ZVE32F-NEXT: .LBB75_8: # %else11 +; RV64ZVE32F-NEXT: .LBB75_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB75_10 -; RV64ZVE32F-NEXT: .LBB75_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_9 +; RV64ZVE32F-NEXT: .LBB75_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB75_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB75_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB75_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB75_16 -; RV64ZVE32F-NEXT: .LBB75_12: # %else20 +; RV64ZVE32F-NEXT: .LBB75_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB75_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_6 ; RV64ZVE32F-NEXT: .LBB75_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) @@ -8543,19 +8553,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB75_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB75_7 ; RV64ZVE32F-NEXT: .LBB75_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB75_9 -; RV64ZVE32F-NEXT: j .LBB75_10 +; RV64ZVE32F-NEXT: bnez a2, 
.LBB75_8 +; RV64ZVE32F-NEXT: j .LBB75_9 ; RV64ZVE32F-NEXT: .LBB75_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -8566,7 +8576,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB75_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB75_11 ; RV64ZVE32F-NEXT: .LBB75_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -8636,81 +8646,83 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB76_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB76_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB76_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB76_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB76_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB76_14 -; RV64ZVE32F-NEXT: .LBB76_8: # %else11 +; RV64ZVE32F-NEXT: .LBB76_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB76_10 -; RV64ZVE32F-NEXT: .LBB76_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_9 +; RV64ZVE32F-NEXT: .LBB76_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB76_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB76_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB76_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB76_16 -; RV64ZVE32F-NEXT: .LBB76_12: # %else20 +; RV64ZVE32F-NEXT: .LBB76_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB76_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 
0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_6 ; RV64ZVE32F-NEXT: .LBB76_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB76_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB76_7 ; RV64ZVE32F-NEXT: .LBB76_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB76_9 -; RV64ZVE32F-NEXT: j .LBB76_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB76_8 +; RV64ZVE32F-NEXT: j .LBB76_9 ; RV64ZVE32F-NEXT: .LBB76_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -8722,7 +8734,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB76_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB76_11 ; RV64ZVE32F-NEXT: .LBB76_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -8792,56 +8804,58 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB77_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB77_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB77_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB77_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB77_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB77_14 -; RV64ZVE32F-NEXT: .LBB77_8: # %else11 +; 
RV64ZVE32F-NEXT: .LBB77_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB77_10 -; RV64ZVE32F-NEXT: .LBB77_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_9 +; RV64ZVE32F-NEXT: .LBB77_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB77_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB77_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB77_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB77_16 -; RV64ZVE32F-NEXT: .LBB77_12: # %else20 +; RV64ZVE32F-NEXT: .LBB77_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB77_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_6 ; RV64ZVE32F-NEXT: .LBB77_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) @@ -8850,19 +8864,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB77_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB77_7 ; RV64ZVE32F-NEXT: .LBB77_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB77_9 -; RV64ZVE32F-NEXT: j .LBB77_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB77_8 +; RV64ZVE32F-NEXT: j .LBB77_9 ; RV64ZVE32F-NEXT: .LBB77_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -8873,7 +8887,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB77_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB77_11 ; RV64ZVE32F-NEXT: .LBB77_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, 
ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -8940,57 +8954,59 @@ ; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 -; RV64ZVE32F-NEXT: .LBB78_4: # %else2 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: flw fa5, 0(a2) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB78_6: # %else5 +; RV64ZVE32F-NEXT: .LBB78_4: # %else2 ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: andi a2, a1, 4 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB78_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB78_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB78_14 -; RV64ZVE32F-NEXT: .LBB78_8: # %else11 +; RV64ZVE32F-NEXT: .LBB78_7: # %else11 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_10 -; RV64ZVE32F-NEXT: .LBB78_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_9 +; RV64ZVE32F-NEXT: .LBB78_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB78_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB78_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB78_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB78_16 -; RV64ZVE32F-NEXT: .LBB78_12: # %else20 +; RV64ZVE32F-NEXT: .LBB78_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB78_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: flw fa5, 0(a2) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_6 ; RV64ZVE32F-NEXT: .LBB78_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: 
vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) @@ -8999,19 +9015,19 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB78_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB78_7 ; RV64ZVE32F-NEXT: .LBB78_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: flw fa5, 0(a2) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB78_9 -; RV64ZVE32F-NEXT: j .LBB78_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB78_8 +; RV64ZVE32F-NEXT: j .LBB78_9 ; RV64ZVE32F-NEXT: .LBB78_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -9022,7 +9038,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB78_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB78_11 ; RV64ZVE32F-NEXT: .LBB78_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -9095,58 +9111,60 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB79_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB79_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: flw fa5, 0(a3) -; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 -; RV64ZVE32F-NEXT: .LBB79_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB79_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB79_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB79_14 -; RV64ZVE32F-NEXT: .LBB79_8: # %else11 +; RV64ZVE32F-NEXT: .LBB79_7: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB79_10 -; RV64ZVE32F-NEXT: .LBB79_9: # %cond.load13 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_9 +; RV64ZVE32F-NEXT: .LBB79_8: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) ; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; 
RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 ; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5 -; RV64ZVE32F-NEXT: .LBB79_10: # %else14 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5 +; RV64ZVE32F-NEXT: .LBB79_9: # %else14 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB79_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else17 +; RV64ZVE32F-NEXT: # %bb.10: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 ; RV64ZVE32F-NEXT: bnez a2, .LBB79_16 -; RV64ZVE32F-NEXT: .LBB79_12: # %else20 +; RV64ZVE32F-NEXT: .LBB79_11: # %else20 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB79_12: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: flw fa5, 0(a3) +; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2 +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_6 ; RV64ZVE32F-NEXT: .LBB79_13: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 @@ -9156,20 +9174,20 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3 ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB79_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB79_7 ; RV64ZVE32F-NEXT: .LBB79_14: # %cond.load10 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: flw fa5, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma -; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5 -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4 +; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5 +; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB79_9 -; RV64ZVE32F-NEXT: j .LBB79_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB79_8 +; RV64ZVE32F-NEXT: j .LBB79_9 ; RV64ZVE32F-NEXT: .LBB79_15: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 @@ -9181,7 +9199,7 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB79_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB79_11 ; RV64ZVE32F-NEXT: .LBB79_16: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 @@ -10024,53 +10042,46 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB87_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB87_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; 
RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: .LBB87_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB87_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB87_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB87_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB87_16 -; RV64ZVE32F-NEXT: .LBB87_8: # %else11 +; RV64ZVE32F-NEXT: .LBB87_7: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB87_10 -; RV64ZVE32F-NEXT: .LBB87_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_9 +; RV64ZVE32F-NEXT: .LBB87_8: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB87_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB87_9: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB87_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB87_12: # %else17 +; RV64ZVE32F-NEXT: .LBB87_11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB87_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB87_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB87_14: # %else20 +; RV64ZVE32F-NEXT: .LBB87_13: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10080,24 +10091,29 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB87_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_6 ; RV64ZVE32F-NEXT: .LBB87_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB87_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB87_7 ; RV64ZVE32F-NEXT: .LBB87_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; 
RV64ZVE32F-NEXT: bnez a3, .LBB87_9 -; RV64ZVE32F-NEXT: j .LBB87_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB87_8 +; RV64ZVE32F-NEXT: j .LBB87_9 %ptrs = getelementptr inbounds double, ptr %base, <8 x i8> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -10241,53 +10257,46 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB88_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB88_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: .LBB88_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB88_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB88_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB88_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB88_16 -; RV64ZVE32F-NEXT: .LBB88_8: # %else11 +; RV64ZVE32F-NEXT: .LBB88_7: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB88_10 -; RV64ZVE32F-NEXT: .LBB88_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_9 +; RV64ZVE32F-NEXT: .LBB88_8: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB88_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB88_9: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB88_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB88_12: # %else17 +; RV64ZVE32F-NEXT: .LBB88_11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB88_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB88_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB88_14: # %else20 +; RV64ZVE32F-NEXT: .LBB88_13: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10297,24 +10306,29 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB88_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; 
RV64ZVE32F-NEXT: beqz a3, .LBB88_6 ; RV64ZVE32F-NEXT: .LBB88_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB88_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB88_7 ; RV64ZVE32F-NEXT: .LBB88_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB88_9 -; RV64ZVE32F-NEXT: j .LBB88_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB88_8 +; RV64ZVE32F-NEXT: j .LBB88_9 %eidxs = sext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -10461,57 +10475,49 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB89_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: andi a3, a3, 255 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: .LBB89_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB89_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB89_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB89_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB89_16 -; RV64ZVE32F-NEXT: .LBB89_8: # %else11 +; RV64ZVE32F-NEXT: .LBB89_7: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_10 -; RV64ZVE32F-NEXT: .LBB89_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_9 +; RV64ZVE32F-NEXT: .LBB89_8: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB89_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB89_9: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB89_12: # %else17 +; 
RV64ZVE32F-NEXT: .LBB89_11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB89_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB89_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB89_14: # %else20 +; RV64ZVE32F-NEXT: .LBB89_13: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10521,26 +10527,32 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB89_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: andi a3, a3, 255 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_6 ; RV64ZVE32F-NEXT: .LBB89_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB89_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB89_7 ; RV64ZVE32F-NEXT: .LBB89_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: andi a3, a3, 255 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB89_9 -; RV64ZVE32F-NEXT: j .LBB89_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB89_8 +; RV64ZVE32F-NEXT: j .LBB89_9 %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -10686,53 +10698,46 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB90_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB90_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: .LBB90_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB90_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB90_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB90_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB90_16 -; RV64ZVE32F-NEXT: .LBB90_8: # %else11 +; RV64ZVE32F-NEXT: .LBB90_7: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB90_10 -; RV64ZVE32F-NEXT: .LBB90_9: # %cond.load13 -; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_9 +; RV64ZVE32F-NEXT: .LBB90_8: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB90_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB90_9: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB90_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB90_12: # %else17 +; RV64ZVE32F-NEXT: .LBB90_11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB90_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB90_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB90_14: # %else20 +; RV64ZVE32F-NEXT: .LBB90_13: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10742,24 +10747,29 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB90_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_6 ; RV64ZVE32F-NEXT: .LBB90_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB90_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB90_7 ; RV64ZVE32F-NEXT: .LBB90_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB90_9 -; RV64ZVE32F-NEXT: j .LBB90_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB90_8 +; RV64ZVE32F-NEXT: j .LBB90_9 %ptrs = getelementptr inbounds double, ptr %base, <8 x i16> %idxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) ret <8 x double> %v @@ -10904,53 +10914,46 @@ ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB91_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: 
vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a1, a3 -; RV64ZVE32F-NEXT: fld fa2, 0(a3) -; RV64ZVE32F-NEXT: .LBB91_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB91_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB91_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB91_6: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB91_16 -; RV64ZVE32F-NEXT: .LBB91_8: # %else11 +; RV64ZVE32F-NEXT: .LBB91_7: # %else11 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_10 -; RV64ZVE32F-NEXT: .LBB91_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_9 +; RV64ZVE32F-NEXT: .LBB91_8: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB91_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB91_9: # %else14 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa6, 0(a3) -; RV64ZVE32F-NEXT: .LBB91_12: # %else17 +; RV64ZVE32F-NEXT: .LBB91_11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB91_14: # %else20 +; RV64ZVE32F-NEXT: .LBB91_13: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -10960,24 +10963,29 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 56(a0) ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB91_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a1, a3 +; RV64ZVE32F-NEXT: fld fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_6 ; RV64ZVE32F-NEXT: .LBB91_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB91_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB91_7 ; RV64ZVE32F-NEXT: .LBB91_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a1, a3 ; RV64ZVE32F-NEXT: fld fa4, 0(a3) ; RV64ZVE32F-NEXT: andi 
a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB91_9 -; RV64ZVE32F-NEXT: j .LBB91_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB91_8 +; RV64ZVE32F-NEXT: j .LBB91_9 %eidxs = sext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -11127,57 +11135,49 @@ ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa1, 0(a4) ; RV64ZVE32F-NEXT: .LBB92_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a4, a3, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB92_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 -; RV64ZVE32F-NEXT: and a4, a4, a2 -; RV64ZVE32F-NEXT: slli a4, a4, 3 -; RV64ZVE32F-NEXT: add a4, a1, a4 -; RV64ZVE32F-NEXT: fld fa2, 0(a4) -; RV64ZVE32F-NEXT: .LBB92_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a4, .LBB92_14 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a4, a3, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a4, .LBB92_15 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB92_6: # %else8 ; RV64ZVE32F-NEXT: andi a4, a3, 16 -; RV64ZVE32F-NEXT: bnez a4, .LBB92_16 -; RV64ZVE32F-NEXT: .LBB92_8: # %else11 -; RV64ZVE32F-NEXT: andi a4, a3, 32 -; RV64ZVE32F-NEXT: beqz a4, .LBB92_10 -; RV64ZVE32F-NEXT: .LBB92_9: # %cond.load13 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: bnez a4, .LBB92_16 +; RV64ZVE32F-NEXT: .LBB92_7: # %else11 +; RV64ZVE32F-NEXT: andi a4, a3, 32 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_9 +; RV64ZVE32F-NEXT: .LBB92_8: # %cond.load13 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa5, 0(a4) -; RV64ZVE32F-NEXT: .LBB92_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB92_9: # %else14 ; RV64ZVE32F-NEXT: andi a4, a3, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: beqz a4, .LBB92_12 -; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_11 +; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa6, 0(a4) -; RV64ZVE32F-NEXT: .LBB92_12: # %else17 +; RV64ZVE32F-NEXT: .LBB92_11: # %else17 ; RV64ZVE32F-NEXT: andi a3, a3, -128 -; RV64ZVE32F-NEXT: beqz a3, .LBB92_14 -; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19 +; RV64ZVE32F-NEXT: beqz a3, .LBB92_13 +; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a2, a3, a2 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a1, a1, a2 ; RV64ZVE32F-NEXT: fld fa7, 0(a1) -; RV64ZVE32F-NEXT: .LBB92_14: # %else20 +; RV64ZVE32F-NEXT: .LBB92_13: # %else20 ; RV64ZVE32F-NEXT: fsd fa0, 0(a0) ; RV64ZVE32F-NEXT: fsd fa1, 8(a0) ; RV64ZVE32F-NEXT: fsd fa2, 16(a0) @@ -11187,26 +11187,32 @@ ; RV64ZVE32F-NEXT: fsd fa6, 48(a0) ; RV64ZVE32F-NEXT: fsd fa7, 
56(a0) ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB92_14: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: and a4, a4, a2 +; RV64ZVE32F-NEXT: slli a4, a4, 3 +; RV64ZVE32F-NEXT: add a4, a1, a4 +; RV64ZVE32F-NEXT: fld fa2, 0(a4) +; RV64ZVE32F-NEXT: andi a4, a3, 8 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_6 ; RV64ZVE32F-NEXT: .LBB92_15: # %cond.load7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a4, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a4, v8 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa3, 0(a4) ; RV64ZVE32F-NEXT: andi a4, a3, 16 -; RV64ZVE32F-NEXT: beqz a4, .LBB92_8 +; RV64ZVE32F-NEXT: beqz a4, .LBB92_7 ; RV64ZVE32F-NEXT: .LBB92_16: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a4, v8 +; RV64ZVE32F-NEXT: vmv.x.s a4, v9 ; RV64ZVE32F-NEXT: and a4, a4, a2 ; RV64ZVE32F-NEXT: slli a4, a4, 3 ; RV64ZVE32F-NEXT: add a4, a1, a4 ; RV64ZVE32F-NEXT: fld fa4, 0(a4) ; RV64ZVE32F-NEXT: andi a4, a3, 32 -; RV64ZVE32F-NEXT: bnez a4, .LBB92_9 -; RV64ZVE32F-NEXT: j .LBB92_10 +; RV64ZVE32F-NEXT: bnez a4, .LBB92_8 +; RV64ZVE32F-NEXT: j .LBB92_9 %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs %v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru) @@ -12139,29 +12145,31 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1 ; RV64ZVE32F-NEXT: .LBB97_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 2 -; RV64ZVE32F-NEXT: .LBB97_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: bnez a2, .LBB97_25 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_26 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 +; RV64ZVE32F-NEXT: .LBB97_6: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_27 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_8 +; RV64ZVE32F-NEXT: .LBB97_7: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4 ; RV64ZVE32F-NEXT: .LBB97_8: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_10 -; RV64ZVE32F-NEXT: .LBB97_9: # %cond.load13 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -12171,18 +12179,16 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5 ; 
RV64ZVE32F-NEXT: .LBB97_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_28 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_27 ; RV64ZVE32F-NEXT: # %bb.11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_29 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_28 ; RV64ZVE32F-NEXT: .LBB97_12: # %else20 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_30 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_29 ; RV64ZVE32F-NEXT: .LBB97_13: # %else23 ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_15 @@ -12196,54 +12202,47 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 9 ; RV64ZVE32F-NEXT: .LBB97_15: # %else26 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10 -; RV64ZVE32F-NEXT: .LBB97_17: # %else29 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_30 +; RV64ZVE32F-NEXT: # %bb.16: # %else29 ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bltz a2, .LBB97_31 -; RV64ZVE32F-NEXT: # %bb.18: # %else32 +; RV64ZVE32F-NEXT: .LBB97_17: # %else32 ; RV64ZVE32F-NEXT: slli a2, a1, 51 ; RV64ZVE32F-NEXT: bltz a2, .LBB97_32 -; RV64ZVE32F-NEXT: .LBB97_19: # %else35 +; RV64ZVE32F-NEXT: .LBB97_18: # %else35 ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bgez a2, .LBB97_21 -; RV64ZVE32F-NEXT: .LBB97_20: # %cond.load37 +; RV64ZVE32F-NEXT: bgez a2, .LBB97_20 +; RV64ZVE32F-NEXT: .LBB97_19: # %cond.load37 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 13 -; RV64ZVE32F-NEXT: .LBB97_21: # %else38 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 13 +; RV64ZVE32F-NEXT: .LBB97_20: # %else38 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 49 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB97_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.load40 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB97_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.load40 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: vmv.s.x v10, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 14 -; RV64ZVE32F-NEXT: .LBB97_23: # %else41 +; RV64ZVE32F-NEXT: .LBB97_22: # %else41 ; RV64ZVE32F-NEXT: lui a2, 1048568 ; RV64ZVE32F-NEXT: and 
a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB97_25 -; RV64ZVE32F-NEXT: # %bb.24: # %cond.load43 +; RV64ZVE32F-NEXT: beqz a1, .LBB97_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.load43 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -12252,9 +12251,18 @@ ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 15 -; RV64ZVE32F-NEXT: .LBB97_25: # %else44 +; RV64ZVE32F-NEXT: .LBB97_24: # %else44 ; RV64ZVE32F-NEXT: vmv1r.v v8, v9 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB97_25: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB97_6 ; RV64ZVE32F-NEXT: .LBB97_26: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 @@ -12265,18 +12273,9 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB97_8 -; RV64ZVE32F-NEXT: .LBB97_27: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v11, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB97_9 -; RV64ZVE32F-NEXT: j .LBB97_10 -; RV64ZVE32F-NEXT: .LBB97_28: # %cond.load16 +; RV64ZVE32F-NEXT: bnez a2, .LBB97_7 +; RV64ZVE32F-NEXT: j .LBB97_8 +; RV64ZVE32F-NEXT: .LBB97_27: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -12285,7 +12284,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 6 ; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_12 -; RV64ZVE32F-NEXT: .LBB97_29: # %cond.load19 +; RV64ZVE32F-NEXT: .LBB97_28: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 @@ -12296,7 +12295,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 7 ; RV64ZVE32F-NEXT: andi a2, a1, 256 ; RV64ZVE32F-NEXT: beqz a2, .LBB97_13 -; RV64ZVE32F-NEXT: .LBB97_30: # %cond.load22 +; RV64ZVE32F-NEXT: .LBB97_29: # %cond.load22 ; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -12306,27 +12305,36 @@ ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: bnez a2, .LBB97_14 ; RV64ZVE32F-NEXT: j .LBB97_15 +; RV64ZVE32F-NEXT: .LBB97_30: # %cond.load28 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: vmv.s.x v11, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10 +; RV64ZVE32F-NEXT: slli a2, a1, 52 +; RV64ZVE32F-NEXT: bgez a2, .LBB97_17 ; RV64ZVE32F-NEXT: .LBB97_31: # %cond.load31 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 ; 
RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 11 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 11 ; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: bgez a2, .LBB97_19 +; RV64ZVE32F-NEXT: bgez a2, .LBB97_18 ; RV64ZVE32F-NEXT: .LBB97_32: # %cond.load34 ; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: vmv.s.x v10, a2 -; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 12 +; RV64ZVE32F-NEXT: vmv.s.x v8, a2 +; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 12 ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bltz a2, .LBB97_20 -; RV64ZVE32F-NEXT: j .LBB97_21 +; RV64ZVE32F-NEXT: bltz a2, .LBB97_19 +; RV64ZVE32F-NEXT: j .LBB97_20 %ptrs = getelementptr inbounds i8, ptr %base, <16 x i8> %idxs %v = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %ptrs, i32 2, <16 x i1> %m, <16 x i8> %passthru) ret <16 x i8> %v @@ -12395,33 +12403,37 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1 ; RV64ZVE32F-NEXT: .LBB98_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_49 +; RV64ZVE32F-NEXT: # %bb.5: # %else5 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_50 +; RV64ZVE32F-NEXT: .LBB98_6: # %else8 +; RV64ZVE32F-NEXT: andi a2, a1, 16 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_8 +; RV64ZVE32F-NEXT: .LBB98_7: # %cond.load10 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 -; RV64ZVE32F-NEXT: .LBB98_6: # %else5 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_50 -; RV64ZVE32F-NEXT: # %bb.7: # %else8 -; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_51 +; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4 ; RV64ZVE32F-NEXT: .LBB98_8: # %else11 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_10 -; RV64ZVE32F-NEXT: .LBB98_9: # %cond.load13 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 @@ -12429,18 +12441,16 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5 ; RV64ZVE32F-NEXT: .LBB98_10: # %else14 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; 
RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_52 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_51 ; RV64ZVE32F-NEXT: # %bb.11: # %else17 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_53 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_52 ; RV64ZVE32F-NEXT: .LBB98_12: # %else20 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_54 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_53 ; RV64ZVE32F-NEXT: .LBB98_13: # %else23 ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_15 @@ -12455,12 +12465,14 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 9 ; RV64ZVE32F-NEXT: .LBB98_15: # %else26 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_17 ; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 @@ -12468,14 +12480,12 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 10 ; RV64ZVE32F-NEXT: .LBB98_17: # %else29 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_19 ; RV64ZVE32F-NEXT: # %bb.18: # %cond.load31 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 @@ -12488,7 +12498,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 16 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_21 ; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 @@ -12500,7 +12510,7 @@ ; RV64ZVE32F-NEXT: bgez a2, .LBB98_23 ; RV64ZVE32F-NEXT: # %bb.22: # %cond.load37 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -12511,14 +12521,14 @@ ; RV64ZVE32F-NEXT: .LBB98_23: # %else38 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 49 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_55 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 2 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_54 ; RV64ZVE32F-NEXT: # %bb.24: # %else41 ; RV64ZVE32F-NEXT: slli a2, a1, 48 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_56 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_55 ; RV64ZVE32F-NEXT: .LBB98_25: # %else44 ; RV64ZVE32F-NEXT: slli a2, a1, 47 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_57 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_56 ; RV64ZVE32F-NEXT: .LBB98_26: # %else47 ; RV64ZVE32F-NEXT: slli a2, a1, 46 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_28 @@ -12533,30 +12543,34 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 18, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 17 ; RV64ZVE32F-NEXT: .LBB98_28: # %else50 +; RV64ZVE32F-NEXT: 
vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB98_30 -; RV64ZVE32F-NEXT: # %bb.29: # %cond.load52 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 18 -; RV64ZVE32F-NEXT: .LBB98_30: # %else53 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: bltz a2, .LBB98_57 +; RV64ZVE32F-NEXT: # %bb.29: # %else53 ; RV64ZVE32F-NEXT: slli a2, a1, 44 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: bltz a2, .LBB98_58 -; RV64ZVE32F-NEXT: # %bb.31: # %else56 +; RV64ZVE32F-NEXT: .LBB98_30: # %else56 ; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_59 +; RV64ZVE32F-NEXT: bgez a2, .LBB98_32 +; RV64ZVE32F-NEXT: .LBB98_31: # %cond.load58 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 20 ; RV64ZVE32F-NEXT: .LBB98_32: # %else59 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 42 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_34 -; RV64ZVE32F-NEXT: .LBB98_33: # %cond.load61 +; RV64ZVE32F-NEXT: # %bb.33: # %cond.load61 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 @@ -12567,18 +12581,16 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 22, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 21 ; RV64ZVE32F-NEXT: .LBB98_34: # %else62 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_60 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_59 ; RV64ZVE32F-NEXT: # %bb.35: # %else65 ; RV64ZVE32F-NEXT: slli a2, a1, 40 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_61 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_60 ; RV64ZVE32F-NEXT: .LBB98_36: # %else68 ; RV64ZVE32F-NEXT: slli a2, a1, 39 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_62 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_61 ; RV64ZVE32F-NEXT: .LBB98_37: # %else71 ; RV64ZVE32F-NEXT: slli a2, a1, 38 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_39 @@ -12593,45 +12605,37 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 26, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 25 ; RV64ZVE32F-NEXT: .LBB98_39: # %else74 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 37 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB98_41 -; RV64ZVE32F-NEXT: # %bb.40: # %cond.load76 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26 -; 
RV64ZVE32F-NEXT: .LBB98_41: # %else77 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_62 +; RV64ZVE32F-NEXT: # %bb.40: # %else77 ; RV64ZVE32F-NEXT: slli a2, a1, 36 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bltz a2, .LBB98_63 -; RV64ZVE32F-NEXT: # %bb.42: # %else80 +; RV64ZVE32F-NEXT: .LBB98_41: # %else80 ; RV64ZVE32F-NEXT: slli a2, a1, 35 ; RV64ZVE32F-NEXT: bltz a2, .LBB98_64 -; RV64ZVE32F-NEXT: .LBB98_43: # %else83 +; RV64ZVE32F-NEXT: .LBB98_42: # %else83 ; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bgez a2, .LBB98_45 -; RV64ZVE32F-NEXT: .LBB98_44: # %cond.load85 +; RV64ZVE32F-NEXT: bgez a2, .LBB98_44 +; RV64ZVE32F-NEXT: .LBB98_43: # %cond.load85 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 30, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 29 -; RV64ZVE32F-NEXT: .LBB98_45: # %else86 +; RV64ZVE32F-NEXT: .LBB98_44: # %else86 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 33 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB98_47 -; RV64ZVE32F-NEXT: # %bb.46: # %cond.load88 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB98_46 +; RV64ZVE32F-NEXT: # %bb.45: # %cond.load88 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -12639,11 +12643,11 @@ ; RV64ZVE32F-NEXT: vmv.s.x v12, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 31, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 30 -; RV64ZVE32F-NEXT: .LBB98_47: # %else89 +; RV64ZVE32F-NEXT: .LBB98_46: # %else89 ; RV64ZVE32F-NEXT: lui a2, 524288 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB98_49 -; RV64ZVE32F-NEXT: # %bb.48: # %cond.load91 +; RV64ZVE32F-NEXT: beqz a1, .LBB98_48 +; RV64ZVE32F-NEXT: # %bb.47: # %cond.load91 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -12653,9 +12657,19 @@ ; RV64ZVE32F-NEXT: vmv.s.x v8, a0 ; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 31 -; RV64ZVE32F-NEXT: .LBB98_49: # %else92 +; RV64ZVE32F-NEXT: .LBB98_48: # %else92 ; RV64ZVE32F-NEXT: vmv2r.v v8, v10 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB98_49: # %cond.load4 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2 +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB98_6 ; RV64ZVE32F-NEXT: .LBB98_50: # %cond.load7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 @@ -12667,21 +12681,9 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 3 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB98_8 -; RV64ZVE32F-NEXT: .LBB98_51: # %cond.load10 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu 
a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v14, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4 -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB98_9 -; RV64ZVE32F-NEXT: j .LBB98_10 -; RV64ZVE32F-NEXT: .LBB98_52: # %cond.load16 +; RV64ZVE32F-NEXT: bnez a2, .LBB98_7 +; RV64ZVE32F-NEXT: j .LBB98_8 +; RV64ZVE32F-NEXT: .LBB98_51: # %cond.load16 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -12691,7 +12693,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 6 ; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_12 -; RV64ZVE32F-NEXT: .LBB98_53: # %cond.load19 +; RV64ZVE32F-NEXT: .LBB98_52: # %cond.load19 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 @@ -12703,7 +12705,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 7 ; RV64ZVE32F-NEXT: andi a2, a1, 256 ; RV64ZVE32F-NEXT: beqz a2, .LBB98_13 -; RV64ZVE32F-NEXT: .LBB98_54: # %cond.load22 +; RV64ZVE32F-NEXT: .LBB98_53: # %cond.load22 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -12716,7 +12718,7 @@ ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: bnez a2, .LBB98_14 ; RV64ZVE32F-NEXT: j .LBB98_15 -; RV64ZVE32F-NEXT: .LBB98_55: # %cond.load40 +; RV64ZVE32F-NEXT: .LBB98_54: # %cond.load40 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -12726,7 +12728,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 14 ; RV64ZVE32F-NEXT: slli a2, a1, 48 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_25 -; RV64ZVE32F-NEXT: .LBB98_56: # %cond.load43 +; RV64ZVE32F-NEXT: .LBB98_55: # %cond.load43 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -12738,7 +12740,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 15 ; RV64ZVE32F-NEXT: slli a2, a1, 47 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_26 -; RV64ZVE32F-NEXT: .LBB98_57: # %cond.load46 +; RV64ZVE32F-NEXT: .LBB98_56: # %cond.load46 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -12751,6 +12753,16 @@ ; RV64ZVE32F-NEXT: slli a2, a1, 46 ; RV64ZVE32F-NEXT: bltz a2, .LBB98_27 ; RV64ZVE32F-NEXT: j .LBB98_28 +; RV64ZVE32F-NEXT: .LBB98_57: # %cond.load52 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vmv.s.x v14, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 18 +; RV64ZVE32F-NEXT: slli a2, a1, 44 +; RV64ZVE32F-NEXT: bgez a2, .LBB98_30 ; RV64ZVE32F-NEXT: .LBB98_58: # %cond.load55 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 @@ -12762,21 +12774,9 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 20, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 19 ; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bgez a2, .LBB98_32 -; RV64ZVE32F-NEXT: .LBB98_59: # %cond.load58 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: li a3, 32 -; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.s.x v12, a2 -; 
RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, ma -; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 20 -; RV64ZVE32F-NEXT: slli a2, a1, 42 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_33 -; RV64ZVE32F-NEXT: j .LBB98_34 -; RV64ZVE32F-NEXT: .LBB98_60: # %cond.load64 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_31 +; RV64ZVE32F-NEXT: j .LBB98_32 +; RV64ZVE32F-NEXT: .LBB98_59: # %cond.load64 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) @@ -12786,7 +12786,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 22 ; RV64ZVE32F-NEXT: slli a2, a1, 40 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_36 -; RV64ZVE32F-NEXT: .LBB98_61: # %cond.load67 +; RV64ZVE32F-NEXT: .LBB98_60: # %cond.load67 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 @@ -12798,7 +12798,7 @@ ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 23 ; RV64ZVE32F-NEXT: slli a2, a1, 39 ; RV64ZVE32F-NEXT: bgez a2, .LBB98_37 -; RV64ZVE32F-NEXT: .LBB98_62: # %cond.load70 +; RV64ZVE32F-NEXT: .LBB98_61: # %cond.load70 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -12811,10 +12811,20 @@ ; RV64ZVE32F-NEXT: slli a2, a1, 38 ; RV64ZVE32F-NEXT: bltz a2, .LBB98_38 ; RV64ZVE32F-NEXT: j .LBB98_39 +; RV64ZVE32F-NEXT: .LBB98_62: # %cond.load76 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: lbu a2, 0(a2) +; RV64ZVE32F-NEXT: li a3, 32 +; RV64ZVE32F-NEXT: vmv.s.x v12, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, ma +; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26 +; RV64ZVE32F-NEXT: slli a2, a1, 36 +; RV64ZVE32F-NEXT: bgez a2, .LBB98_41 ; RV64ZVE32F-NEXT: .LBB98_63: # %cond.load79 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 @@ -12822,10 +12832,10 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 28, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 27 ; RV64ZVE32F-NEXT: slli a2, a1, 35 -; RV64ZVE32F-NEXT: bgez a2, .LBB98_43 +; RV64ZVE32F-NEXT: bgez a2, .LBB98_42 ; RV64ZVE32F-NEXT: .LBB98_64: # %cond.load82 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: lbu a2, 0(a2) ; RV64ZVE32F-NEXT: li a3, 32 @@ -12834,8 +12844,8 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 29, e8, m2, tu, ma ; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 28 ; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bltz a2, .LBB98_44 -; RV64ZVE32F-NEXT: j .LBB98_45 +; RV64ZVE32F-NEXT: bltz a2, .LBB98_43 +; RV64ZVE32F-NEXT: j .LBB98_44 %ptrs = getelementptr inbounds i8, ptr %base, <32 x i8> %idxs %v = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %ptrs, i32 2, <32 x i1> %m, <32 x i8> %passthru) ret <32 x i8> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -532,64 +532,66 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB9_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; 
RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB9_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB9_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB9_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB9_14 -; RV64ZVE32F-NEXT: .LBB9_8: # %else8 +; RV64ZVE32F-NEXT: .LBB9_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_10 -; RV64ZVE32F-NEXT: .LBB9_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_9 +; RV64ZVE32F-NEXT: .LBB9_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB9_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB9_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB9_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB9_16 -; RV64ZVE32F-NEXT: .LBB9_12: # %else14 +; RV64ZVE32F-NEXT: .LBB9_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB9_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_6 ; RV64ZVE32F-NEXT: .LBB9_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB9_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB9_7 ; RV64ZVE32F-NEXT: .LBB9_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB9_9 
-; RV64ZVE32F-NEXT: j .LBB9_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB9_8 +; RV64ZVE32F-NEXT: j .LBB9_9 ; RV64ZVE32F-NEXT: .LBB9_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -597,7 +599,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB9_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB9_11 ; RV64ZVE32F-NEXT: .LBB9_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1081,69 +1083,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB18_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB18_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB18_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB18_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB18_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB18_14 -; RV64ZVE32F-NEXT: .LBB18_8: # %else8 +; RV64ZVE32F-NEXT: .LBB18_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_10 -; RV64ZVE32F-NEXT: .LBB18_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_9 +; RV64ZVE32F-NEXT: .LBB18_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB18_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB18_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB18_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB18_16 -; RV64ZVE32F-NEXT: .LBB18_12: # %else14 +; RV64ZVE32F-NEXT: .LBB18_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB18_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_6 ; RV64ZVE32F-NEXT: .LBB18_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; 
RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB18_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB18_7 ; RV64ZVE32F-NEXT: .LBB18_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB18_9 -; RV64ZVE32F-NEXT: j .LBB18_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB18_8 +; RV64ZVE32F-NEXT: j .LBB18_9 ; RV64ZVE32F-NEXT: .LBB18_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1152,7 +1156,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB18_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB18_11 ; RV64ZVE32F-NEXT: .LBB18_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1212,69 +1216,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB19_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB19_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB19_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_14 -; RV64ZVE32F-NEXT: .LBB19_8: # %else8 +; RV64ZVE32F-NEXT: .LBB19_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_10 -; RV64ZVE32F-NEXT: .LBB19_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_9 +; RV64ZVE32F-NEXT: .LBB19_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: 
.LBB19_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB19_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB19_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB19_16 -; RV64ZVE32F-NEXT: .LBB19_12: # %else14 +; RV64ZVE32F-NEXT: .LBB19_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB19_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_6 ; RV64ZVE32F-NEXT: .LBB19_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB19_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB19_7 ; RV64ZVE32F-NEXT: .LBB19_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB19_9 -; RV64ZVE32F-NEXT: j .LBB19_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB19_8 +; RV64ZVE32F-NEXT: j .LBB19_9 ; RV64ZVE32F-NEXT: .LBB19_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1283,7 +1289,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB19_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB19_11 ; RV64ZVE32F-NEXT: .LBB19_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1346,73 +1352,75 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB20_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB20_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, 
mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB20_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB20_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB20_14 -; RV64ZVE32F-NEXT: .LBB20_8: # %else8 +; RV64ZVE32F-NEXT: .LBB20_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_10 -; RV64ZVE32F-NEXT: .LBB20_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_9 +; RV64ZVE32F-NEXT: .LBB20_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB20_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB20_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB20_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB20_16 -; RV64ZVE32F-NEXT: .LBB20_12: # %else14 +; RV64ZVE32F-NEXT: .LBB20_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB20_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_6 ; RV64ZVE32F-NEXT: .LBB20_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB20_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB20_7 ; RV64ZVE32F-NEXT: .LBB20_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB20_9 -; RV64ZVE32F-NEXT: j .LBB20_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB20_8 +; RV64ZVE32F-NEXT: j .LBB20_9 ; RV64ZVE32F-NEXT: .LBB20_15: # 
%cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -1422,7 +1430,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB20_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB20_11 ; RV64ZVE32F-NEXT: .LBB20_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1483,68 +1491,70 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB21_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB21_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB21_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB21_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB21_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB21_14 -; RV64ZVE32F-NEXT: .LBB21_8: # %else8 +; RV64ZVE32F-NEXT: .LBB21_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_10 -; RV64ZVE32F-NEXT: .LBB21_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_9 +; RV64ZVE32F-NEXT: .LBB21_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB21_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB21_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB21_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB21_16 -; RV64ZVE32F-NEXT: .LBB21_12: # %else14 +; RV64ZVE32F-NEXT: .LBB21_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB21_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_6 ; RV64ZVE32F-NEXT: .LBB21_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; 
RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB21_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB21_7 ; RV64ZVE32F-NEXT: .LBB21_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB21_9 -; RV64ZVE32F-NEXT: j .LBB21_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB21_8 +; RV64ZVE32F-NEXT: j .LBB21_9 ; RV64ZVE32F-NEXT: .LBB21_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -1553,7 +1563,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB21_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB21_11 ; RV64ZVE32F-NEXT: .LBB21_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -1981,69 +1991,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB29_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB29_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB29_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB29_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB29_14 -; RV64ZVE32F-NEXT: .LBB29_8: # %else8 +; RV64ZVE32F-NEXT: .LBB29_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_10 -; RV64ZVE32F-NEXT: .LBB29_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_9 +; RV64ZVE32F-NEXT: .LBB29_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB29_10: # %else10 +; RV64ZVE32F-NEXT: .LBB29_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: 
vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB29_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB29_16 -; RV64ZVE32F-NEXT: .LBB29_12: # %else14 +; RV64ZVE32F-NEXT: .LBB29_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB29_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_6 ; RV64ZVE32F-NEXT: .LBB29_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB29_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB29_7 ; RV64ZVE32F-NEXT: .LBB29_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB29_9 -; RV64ZVE32F-NEXT: j .LBB29_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB29_8 +; RV64ZVE32F-NEXT: j .LBB29_9 ; RV64ZVE32F-NEXT: .LBB29_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2052,7 +2064,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB29_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB29_11 ; RV64ZVE32F-NEXT: .LBB29_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2111,69 +2123,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB30_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB30_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB30_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB30_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB30_14 -; RV64ZVE32F-NEXT: .LBB30_8: # %else8 
+; RV64ZVE32F-NEXT: .LBB30_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_10 -; RV64ZVE32F-NEXT: .LBB30_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_9 +; RV64ZVE32F-NEXT: .LBB30_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB30_10: # %else10 +; RV64ZVE32F-NEXT: .LBB30_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB30_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB30_16 -; RV64ZVE32F-NEXT: .LBB30_12: # %else14 +; RV64ZVE32F-NEXT: .LBB30_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB30_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_6 ; RV64ZVE32F-NEXT: .LBB30_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB30_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB30_7 ; RV64ZVE32F-NEXT: .LBB30_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB30_9 -; RV64ZVE32F-NEXT: j .LBB30_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB30_8 +; RV64ZVE32F-NEXT: j .LBB30_9 ; RV64ZVE32F-NEXT: .LBB30_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2182,7 +2196,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB30_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB30_11 ; RV64ZVE32F-NEXT: .LBB30_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2244,53 +2258,55 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB31_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 
-; RV64ZVE32F-NEXT: beqz a2, .LBB31_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB31_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB31_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB31_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB31_14 -; RV64ZVE32F-NEXT: .LBB31_8: # %else8 +; RV64ZVE32F-NEXT: .LBB31_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_10 -; RV64ZVE32F-NEXT: .LBB31_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_9 +; RV64ZVE32F-NEXT: .LBB31_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB31_10: # %else10 +; RV64ZVE32F-NEXT: .LBB31_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB31_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB31_16 -; RV64ZVE32F-NEXT: .LBB31_12: # %else14 +; RV64ZVE32F-NEXT: .LBB31_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB31_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_6 ; RV64ZVE32F-NEXT: .LBB31_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2298,10 +2314,10 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB31_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB31_7 ; RV64ZVE32F-NEXT: .LBB31_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -2309,8 +2325,8 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: 
bnez a2, .LBB31_9 -; RV64ZVE32F-NEXT: j .LBB31_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB31_8 +; RV64ZVE32F-NEXT: j .LBB31_9 ; RV64ZVE32F-NEXT: .LBB31_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -2320,7 +2336,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB31_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB31_11 ; RV64ZVE32F-NEXT: .LBB31_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2382,69 +2398,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB32_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB32_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB32_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB32_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB32_14 -; RV64ZVE32F-NEXT: .LBB32_8: # %else8 +; RV64ZVE32F-NEXT: .LBB32_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_10 -; RV64ZVE32F-NEXT: .LBB32_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_9 +; RV64ZVE32F-NEXT: .LBB32_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB32_10: # %else10 +; RV64ZVE32F-NEXT: .LBB32_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB32_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB32_16 -; RV64ZVE32F-NEXT: .LBB32_12: # %else14 +; RV64ZVE32F-NEXT: .LBB32_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB32_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_6 ; RV64ZVE32F-NEXT: .LBB32_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; 
RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB32_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB32_7 ; RV64ZVE32F-NEXT: .LBB32_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB32_9 -; RV64ZVE32F-NEXT: j .LBB32_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB32_8 +; RV64ZVE32F-NEXT: j .LBB32_9 ; RV64ZVE32F-NEXT: .LBB32_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2453,7 +2471,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB32_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB32_11 ; RV64ZVE32F-NEXT: .LBB32_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2513,69 +2531,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB33_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB33_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB33_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB33_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB33_14 -; RV64ZVE32F-NEXT: .LBB33_8: # %else8 +; RV64ZVE32F-NEXT: .LBB33_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_10 -; RV64ZVE32F-NEXT: .LBB33_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_9 +; RV64ZVE32F-NEXT: .LBB33_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB33_10: # %else10 +; RV64ZVE32F-NEXT: .LBB33_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB33_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB33_16 -; RV64ZVE32F-NEXT: .LBB33_12: # %else14 +; RV64ZVE32F-NEXT: .LBB33_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB33_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_6 ; RV64ZVE32F-NEXT: .LBB33_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB33_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB33_7 ; RV64ZVE32F-NEXT: .LBB33_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB33_9 -; RV64ZVE32F-NEXT: j .LBB33_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB33_8 +; RV64ZVE32F-NEXT: j .LBB33_9 ; RV64ZVE32F-NEXT: .LBB33_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -2584,7 +2604,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB33_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB33_11 ; RV64ZVE32F-NEXT: .LBB33_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -2649,53 +2669,55 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: .LBB34_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: .LBB34_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB34_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB34_6: # %else6 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB34_14 -; RV64ZVE32F-NEXT: 
.LBB34_8: # %else8 +; RV64ZVE32F-NEXT: .LBB34_7: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_10 -; RV64ZVE32F-NEXT: .LBB34_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_9 +; RV64ZVE32F-NEXT: .LBB34_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: .LBB34_10: # %else10 +; RV64ZVE32F-NEXT: .LBB34_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB34_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a2, a2, -128 ; RV64ZVE32F-NEXT: bnez a2, .LBB34_16 -; RV64ZVE32F-NEXT: .LBB34_12: # %else14 +; RV64ZVE32F-NEXT: .LBB34_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB34_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_6 ; RV64ZVE32F-NEXT: .LBB34_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 @@ -2703,10 +2725,10 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB34_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB34_7 ; RV64ZVE32F-NEXT: .LBB34_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 @@ -2714,8 +2736,8 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB34_9 -; RV64ZVE32F-NEXT: j .LBB34_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB34_8 +; RV64ZVE32F-NEXT: j .LBB34_9 ; RV64ZVE32F-NEXT: .LBB34_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 @@ -2725,7 +2747,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB34_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB34_11 ; RV64ZVE32F-NEXT: .LBB34_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -3619,68 +3641,66 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB42_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, 
ta, ma ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: .LBB42_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB42_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB42_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB42_14 -; RV64ZVE32F-NEXT: .LBB42_8: # %else8 +; RV64ZVE32F-NEXT: .LBB42_7: # %else8 ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_10 -; RV64ZVE32F-NEXT: .LBB42_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_9 +; RV64ZVE32F-NEXT: .LBB42_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB42_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB42_9: # %else10 ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB42_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB42_16 -; RV64ZVE32F-NEXT: .LBB42_12: # %else14 +; RV64ZVE32F-NEXT: .LBB42_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB42_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_6 ; RV64ZVE32F-NEXT: .LBB42_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_8 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_7 ; RV64ZVE32F-NEXT: .LBB42_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB42_9 -; RV64ZVE32F-NEXT: j .LBB42_10 +; RV64ZVE32F-NEXT: bnez a0, .LBB42_8 +; RV64ZVE32F-NEXT: j .LBB42_9 ; RV64ZVE32F-NEXT: .LBB42_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB42_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB42_11 ; RV64ZVE32F-NEXT: .LBB42_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -3865,68 +3885,66 @@ ; 
RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB43_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: .LBB43_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB43_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_14 -; RV64ZVE32F-NEXT: .LBB43_8: # %else8 +; RV64ZVE32F-NEXT: .LBB43_7: # %else8 ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_10 -; RV64ZVE32F-NEXT: .LBB43_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_9 +; RV64ZVE32F-NEXT: .LBB43_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB43_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB43_9: # %else10 ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB43_16 -; RV64ZVE32F-NEXT: .LBB43_12: # %else14 +; RV64ZVE32F-NEXT: .LBB43_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB43_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_6 ; RV64ZVE32F-NEXT: .LBB43_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_8 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_7 ; RV64ZVE32F-NEXT: .LBB43_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB43_9 -; RV64ZVE32F-NEXT: j .LBB43_10 +; RV64ZVE32F-NEXT: bnez a0, .LBB43_8 +; RV64ZVE32F-NEXT: j .LBB43_9 ; RV64ZVE32F-NEXT: .LBB43_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, 
a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB43_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB43_11 ; RV64ZVE32F-NEXT: .LBB43_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -4114,65 +4132,63 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB44_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: andi a0, a0, 255 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: .LBB44_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB44_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB44_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_14 -; RV64ZVE32F-NEXT: .LBB44_8: # %else8 +; RV64ZVE32F-NEXT: .LBB44_7: # %else8 ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_10 -; RV64ZVE32F-NEXT: .LBB44_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_9 +; RV64ZVE32F-NEXT: .LBB44_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB44_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB44_9: # %else10 ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB44_16 -; RV64ZVE32F-NEXT: .LBB44_12: # %else14 +; RV64ZVE32F-NEXT: .LBB44_11: # %else14 ; RV64ZVE32F-NEXT: ret -; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: .LBB44_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd a7, 0(a0) -; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_8 -; RV64ZVE32F-NEXT: .LBB44_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_6 +; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: andi a0, a0, 255 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd a7, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 16 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_7 +; 
RV64ZVE32F-NEXT: .LBB44_14: # %cond.store7 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: andi a0, a0, 255 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB44_9 -; RV64ZVE32F-NEXT: j .LBB44_10 +; RV64ZVE32F-NEXT: bnez a0, .LBB44_8 +; RV64ZVE32F-NEXT: j .LBB44_9 ; RV64ZVE32F-NEXT: .LBB44_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: andi a0, a0, 255 @@ -4180,7 +4196,7 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB44_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB44_11 ; RV64ZVE32F-NEXT: .LBB44_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -4368,68 +4384,66 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB45_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: .LBB45_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB45_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_14 -; RV64ZVE32F-NEXT: .LBB45_8: # %else8 +; RV64ZVE32F-NEXT: .LBB45_7: # %else8 ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_10 -; RV64ZVE32F-NEXT: .LBB45_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_9 +; RV64ZVE32F-NEXT: .LBB45_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB45_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB45_9: # %else10 ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB45_16 -; RV64ZVE32F-NEXT: .LBB45_12: # %else14 +; RV64ZVE32F-NEXT: .LBB45_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB45_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_6 ; RV64ZVE32F-NEXT: .LBB45_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; 
RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_8 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_7 ; RV64ZVE32F-NEXT: .LBB45_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB45_9 -; RV64ZVE32F-NEXT: j .LBB45_10 +; RV64ZVE32F-NEXT: bnez a0, .LBB45_8 +; RV64ZVE32F-NEXT: j .LBB45_9 ; RV64ZVE32F-NEXT: .LBB45_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB45_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB45_11 ; RV64ZVE32F-NEXT: .LBB45_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -4615,68 +4629,66 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t1, 0(a0) ; RV64ZVE32F-NEXT: .LBB46_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a0, a4, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t0, 0(a0) -; RV64ZVE32F-NEXT: .LBB46_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a4, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB46_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a4, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_14 -; RV64ZVE32F-NEXT: .LBB46_8: # %else8 +; RV64ZVE32F-NEXT: .LBB46_7: # %else8 ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_10 -; RV64ZVE32F-NEXT: .LBB46_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_9 +; RV64ZVE32F-NEXT: .LBB46_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a5, 0(a0) -; RV64ZVE32F-NEXT: .LBB46_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB46_9: # %else10 ; RV64ZVE32F-NEXT: andi a0, a4, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a4, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB46_16 -; RV64ZVE32F-NEXT: .LBB46_12: # %else14 +; RV64ZVE32F-NEXT: .LBB46_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB46_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t0, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a4, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_6 ; RV64ZVE32F-NEXT: 
.LBB46_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_8 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_7 ; RV64ZVE32F-NEXT: .LBB46_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB46_9 -; RV64ZVE32F-NEXT: j .LBB46_10 +; RV64ZVE32F-NEXT: bnez a0, .LBB46_8 +; RV64ZVE32F-NEXT: j .LBB46_9 ; RV64ZVE32F-NEXT: .LBB46_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a4, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB46_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB46_11 ; RV64ZVE32F-NEXT: .LBB46_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -4867,65 +4879,63 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t2, 0(a0) ; RV64ZVE32F-NEXT: .LBB47_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a0, a5, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 -; RV64ZVE32F-NEXT: and a0, a0, a4 -; RV64ZVE32F-NEXT: slli a0, a0, 3 -; RV64ZVE32F-NEXT: add a0, a1, a0 -; RV64ZVE32F-NEXT: sd t1, 0(a0) -; RV64ZVE32F-NEXT: .LBB47_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a0, a5, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB47_6: # %else6 ; RV64ZVE32F-NEXT: andi a0, a5, 16 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_14 -; RV64ZVE32F-NEXT: .LBB47_8: # %else8 +; RV64ZVE32F-NEXT: .LBB47_7: # %else8 ; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_10 -; RV64ZVE32F-NEXT: .LBB47_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_9 +; RV64ZVE32F-NEXT: .LBB47_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: and a0, a0, a4 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a6, 0(a0) -; RV64ZVE32F-NEXT: .LBB47_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB47_9: # %else10 ; RV64ZVE32F-NEXT: andi a0, a5, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a0, a5, -128 ; RV64ZVE32F-NEXT: bnez a0, .LBB47_16 -; RV64ZVE32F-NEXT: .LBB47_12: # %else14 +; RV64ZVE32F-NEXT: 
.LBB47_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB47_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: and a0, a0, a4 +; RV64ZVE32F-NEXT: slli a0, a0, 3 +; RV64ZVE32F-NEXT: add a0, a1, a0 +; RV64ZVE32F-NEXT: sd t1, 0(a0) +; RV64ZVE32F-NEXT: andi a0, a5, 8 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_6 ; RV64ZVE32F-NEXT: .LBB47_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a0, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: and a0, a0, a4 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd t0, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a5, 16 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_8 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_7 ; RV64ZVE32F-NEXT: .LBB47_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a0, v8 +; RV64ZVE32F-NEXT: vmv.x.s a0, v9 ; RV64ZVE32F-NEXT: and a0, a0, a4 ; RV64ZVE32F-NEXT: slli a0, a0, 3 ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a7, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a5, 32 -; RV64ZVE32F-NEXT: bnez a0, .LBB47_9 -; RV64ZVE32F-NEXT: j .LBB47_10 +; RV64ZVE32F-NEXT: bnez a0, .LBB47_8 +; RV64ZVE32F-NEXT: j .LBB47_9 ; RV64ZVE32F-NEXT: .LBB47_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 ; RV64ZVE32F-NEXT: and a0, a0, a4 @@ -4933,7 +4943,7 @@ ; RV64ZVE32F-NEXT: add a0, a1, a0 ; RV64ZVE32F-NEXT: sd a3, 0(a0) ; RV64ZVE32F-NEXT: andi a0, a5, -128 -; RV64ZVE32F-NEXT: beqz a0, .LBB47_12 +; RV64ZVE32F-NEXT: beqz a0, .LBB47_11 ; RV64ZVE32F-NEXT: .LBB47_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a0, v8 @@ -6332,69 +6342,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB58_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB58_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB58_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB58_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB58_14 -; RV64ZVE32F-NEXT: .LBB58_8: # %else8 +; RV64ZVE32F-NEXT: .LBB58_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_10 -; RV64ZVE32F-NEXT: .LBB58_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_9 +; RV64ZVE32F-NEXT: .LBB58_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; 
RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB58_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB58_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB58_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB58_16 -; RV64ZVE32F-NEXT: .LBB58_12: # %else14 +; RV64ZVE32F-NEXT: .LBB58_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB58_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_6 ; RV64ZVE32F-NEXT: .LBB58_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB58_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB58_7 ; RV64ZVE32F-NEXT: .LBB58_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB58_9 -; RV64ZVE32F-NEXT: j .LBB58_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB58_8 +; RV64ZVE32F-NEXT: j .LBB58_9 ; RV64ZVE32F-NEXT: .LBB58_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -6403,7 +6415,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB58_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB58_11 ; RV64ZVE32F-NEXT: .LBB58_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -6463,69 +6475,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB59_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: 
.LBB59_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB59_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB59_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB59_14 -; RV64ZVE32F-NEXT: .LBB59_8: # %else8 +; RV64ZVE32F-NEXT: .LBB59_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_10 -; RV64ZVE32F-NEXT: .LBB59_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_9 +; RV64ZVE32F-NEXT: .LBB59_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB59_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB59_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB59_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB59_16 -; RV64ZVE32F-NEXT: .LBB59_12: # %else14 +; RV64ZVE32F-NEXT: .LBB59_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB59_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_6 ; RV64ZVE32F-NEXT: .LBB59_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB59_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB59_7 ; RV64ZVE32F-NEXT: .LBB59_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB59_9 -; RV64ZVE32F-NEXT: j .LBB59_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB59_8 +; RV64ZVE32F-NEXT: j .LBB59_9 ; RV64ZVE32F-NEXT: .LBB59_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 
@@ -6534,7 +6548,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB59_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB59_11 ; RV64ZVE32F-NEXT: .LBB59_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -6597,73 +6611,75 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB60_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB60_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB60_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB60_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB60_14 -; RV64ZVE32F-NEXT: .LBB60_8: # %else8 +; RV64ZVE32F-NEXT: .LBB60_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_10 -; RV64ZVE32F-NEXT: .LBB60_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_9 +; RV64ZVE32F-NEXT: .LBB60_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB60_10: # %else10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB60_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB60_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB60_16 -; RV64ZVE32F-NEXT: .LBB60_12: # %else14 +; RV64ZVE32F-NEXT: .LBB60_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB60_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_6 ; RV64ZVE32F-NEXT: .LBB60_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: 
vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB60_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB60_7 ; RV64ZVE32F-NEXT: .LBB60_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB60_9 -; RV64ZVE32F-NEXT: j .LBB60_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB60_8 +; RV64ZVE32F-NEXT: j .LBB60_9 ; RV64ZVE32F-NEXT: .LBB60_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -6673,7 +6689,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB60_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB60_11 ; RV64ZVE32F-NEXT: .LBB60_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -6734,68 +6750,70 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB61_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: slli a2, a2, 1 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 -; RV64ZVE32F-NEXT: vse16.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB61_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB61_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB61_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB61_14 -; RV64ZVE32F-NEXT: .LBB61_8: # %else8 +; RV64ZVE32F-NEXT: .LBB61_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_10 -; RV64ZVE32F-NEXT: .LBB61_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_9 +; RV64ZVE32F-NEXT: .LBB61_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB61_10: # %else10 +; 
RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB61_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB61_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB61_16 -; RV64ZVE32F-NEXT: .LBB61_12: # %else14 +; RV64ZVE32F-NEXT: .LBB61_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB61_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: slli a2, a2, 1 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2 +; RV64ZVE32F-NEXT: vse16.v v11, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_6 ; RV64ZVE32F-NEXT: .LBB61_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB61_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB61_7 ; RV64ZVE32F-NEXT: .LBB61_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 1 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4 -; RV64ZVE32F-NEXT: vse16.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 +; RV64ZVE32F-NEXT: vse16.v v9, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB61_9 -; RV64ZVE32F-NEXT: j .LBB61_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB61_8 +; RV64ZVE32F-NEXT: j .LBB61_9 ; RV64ZVE32F-NEXT: .LBB61_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 1 @@ -6804,7 +6822,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6 ; RV64ZVE32F-NEXT: vse16.v v10, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB61_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB61_11 ; RV64ZVE32F-NEXT: .LBB61_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 @@ -7178,69 +7196,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB68_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB68_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB68_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_12 +; 
RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB68_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB68_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB68_14 -; RV64ZVE32F-NEXT: .LBB68_8: # %else8 +; RV64ZVE32F-NEXT: .LBB68_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB68_10 -; RV64ZVE32F-NEXT: .LBB68_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_9 +; RV64ZVE32F-NEXT: .LBB68_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB68_10: # %else10 +; RV64ZVE32F-NEXT: .LBB68_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB68_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB68_16 -; RV64ZVE32F-NEXT: .LBB68_12: # %else14 +; RV64ZVE32F-NEXT: .LBB68_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB68_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_6 ; RV64ZVE32F-NEXT: .LBB68_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB68_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB68_7 ; RV64ZVE32F-NEXT: .LBB68_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB68_9 -; RV64ZVE32F-NEXT: j .LBB68_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB68_8 +; RV64ZVE32F-NEXT: j .LBB68_9 ; RV64ZVE32F-NEXT: .LBB68_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -7249,7 +7269,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB68_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB68_11 ; RV64ZVE32F-NEXT: .LBB68_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -7308,69 +7328,71 @@ ; 
RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB69_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB69_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB69_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB69_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB69_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB69_14 -; RV64ZVE32F-NEXT: .LBB69_8: # %else8 +; RV64ZVE32F-NEXT: .LBB69_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB69_10 -; RV64ZVE32F-NEXT: .LBB69_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_9 +; RV64ZVE32F-NEXT: .LBB69_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB69_10: # %else10 +; RV64ZVE32F-NEXT: .LBB69_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB69_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB69_16 -; RV64ZVE32F-NEXT: .LBB69_12: # %else14 +; RV64ZVE32F-NEXT: .LBB69_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB69_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_6 ; RV64ZVE32F-NEXT: .LBB69_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB69_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB69_7 ; RV64ZVE32F-NEXT: .LBB69_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, 
a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB69_9 -; RV64ZVE32F-NEXT: j .LBB69_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB69_8 +; RV64ZVE32F-NEXT: j .LBB69_9 ; RV64ZVE32F-NEXT: .LBB69_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -7379,7 +7401,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB69_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB69_11 ; RV64ZVE32F-NEXT: .LBB69_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -7441,53 +7463,55 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB70_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB70_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB70_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB70_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB70_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB70_14 -; RV64ZVE32F-NEXT: .LBB70_8: # %else8 +; RV64ZVE32F-NEXT: .LBB70_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB70_10 -; RV64ZVE32F-NEXT: .LBB70_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_9 +; RV64ZVE32F-NEXT: .LBB70_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB70_10: # %else10 +; RV64ZVE32F-NEXT: .LBB70_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB70_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB70_16 -; RV64ZVE32F-NEXT: .LBB70_12: # %else14 +; RV64ZVE32F-NEXT: .LBB70_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB70_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: 
vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_6 ; RV64ZVE32F-NEXT: .LBB70_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7495,10 +7519,10 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB70_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB70_7 ; RV64ZVE32F-NEXT: .LBB70_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -7506,8 +7530,8 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB70_9 -; RV64ZVE32F-NEXT: j .LBB70_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB70_8 +; RV64ZVE32F-NEXT: j .LBB70_9 ; RV64ZVE32F-NEXT: .LBB70_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -7517,7 +7541,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB70_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB70_11 ; RV64ZVE32F-NEXT: .LBB70_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -7579,69 +7603,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB71_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB71_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB71_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB71_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB71_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB71_14 -; RV64ZVE32F-NEXT: .LBB71_8: # %else8 +; RV64ZVE32F-NEXT: .LBB71_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB71_10 -; RV64ZVE32F-NEXT: .LBB71_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_9 +; RV64ZVE32F-NEXT: .LBB71_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB71_10: # %else10 +; RV64ZVE32F-NEXT: .LBB71_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB71_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB71_16 -; RV64ZVE32F-NEXT: .LBB71_12: # %else14 +; RV64ZVE32F-NEXT: .LBB71_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB71_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_6 ; RV64ZVE32F-NEXT: .LBB71_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB71_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB71_7 ; RV64ZVE32F-NEXT: .LBB71_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB71_9 -; RV64ZVE32F-NEXT: j .LBB71_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB71_8 +; RV64ZVE32F-NEXT: j .LBB71_9 ; RV64ZVE32F-NEXT: .LBB71_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -7650,7 +7676,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB71_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB71_11 ; RV64ZVE32F-NEXT: .LBB71_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -7710,69 +7736,71 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB72_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB72_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: slli a2, a2, 2 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB72_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_12 +; 
RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB72_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB72_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB72_14 -; RV64ZVE32F-NEXT: .LBB72_8: # %else8 +; RV64ZVE32F-NEXT: .LBB72_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB72_10 -; RV64ZVE32F-NEXT: .LBB72_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_9 +; RV64ZVE32F-NEXT: .LBB72_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB72_10: # %else10 +; RV64ZVE32F-NEXT: .LBB72_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB72_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB72_16 -; RV64ZVE32F-NEXT: .LBB72_12: # %else14 +; RV64ZVE32F-NEXT: .LBB72_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB72_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: slli a2, a2, 2 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_6 ; RV64ZVE32F-NEXT: .LBB72_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB72_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB72_7 ; RV64ZVE32F-NEXT: .LBB72_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: slli a2, a2, 2 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB72_9 -; RV64ZVE32F-NEXT: j .LBB72_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB72_8 +; RV64ZVE32F-NEXT: j .LBB72_9 ; RV64ZVE32F-NEXT: .LBB72_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: slli a2, a2, 2 @@ -7781,7 +7809,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB72_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB72_11 ; RV64ZVE32F-NEXT: .LBB72_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -7846,53 +7874,55 @@ ; 
RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: .LBB73_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB73_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 2 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 -; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: .LBB73_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB73_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB73_6: # %else6 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB73_14 -; RV64ZVE32F-NEXT: .LBB73_8: # %else8 +; RV64ZVE32F-NEXT: .LBB73_7: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB73_10 -; RV64ZVE32F-NEXT: .LBB73_9: # %cond.store9 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_9 +; RV64ZVE32F-NEXT: .LBB73_8: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) -; RV64ZVE32F-NEXT: .LBB73_10: # %else10 +; RV64ZVE32F-NEXT: .LBB73_9: # %else10 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB73_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a2, a2, -128 ; RV64ZVE32F-NEXT: bnez a2, .LBB73_16 -; RV64ZVE32F-NEXT: .LBB73_12: # %else14 +; RV64ZVE32F-NEXT: .LBB73_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB73_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 2 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse32.v v12, (a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_6 ; RV64ZVE32F-NEXT: .LBB73_13: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 @@ -7900,10 +7930,10 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB73_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB73_7 ; RV64ZVE32F-NEXT: .LBB73_14: # %cond.store7 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, 
mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v10 +; RV64ZVE32F-NEXT: vmv.x.s a3, v11 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 2 ; RV64ZVE32F-NEXT: add a3, a0, a3 @@ -7911,8 +7941,8 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB73_9 -; RV64ZVE32F-NEXT: j .LBB73_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB73_8 +; RV64ZVE32F-NEXT: j .LBB73_9 ; RV64ZVE32F-NEXT: .LBB73_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a3, v10 ; RV64ZVE32F-NEXT: and a3, a3, a1 @@ -7922,7 +7952,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6 ; RV64ZVE32F-NEXT: vse32.v v12, (a3) ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB73_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB73_11 ; RV64ZVE32F-NEXT: .LBB73_16: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 @@ -8677,68 +8707,66 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB81_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: .LBB81_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB81_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB81_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB81_14 -; RV64ZVE32F-NEXT: .LBB81_8: # %else8 +; RV64ZVE32F-NEXT: .LBB81_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_10 -; RV64ZVE32F-NEXT: .LBB81_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_9 +; RV64ZVE32F-NEXT: .LBB81_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB81_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB81_9: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB81_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB81_16 -; RV64ZVE32F-NEXT: .LBB81_12: # %else14 +; RV64ZVE32F-NEXT: .LBB81_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB81_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_6 ; RV64ZVE32F-NEXT: .LBB81_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB81_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB81_7 ; RV64ZVE32F-NEXT: .LBB81_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB81_9 -; RV64ZVE32F-NEXT: j .LBB81_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB81_8 +; RV64ZVE32F-NEXT: j .LBB81_9 ; RV64ZVE32F-NEXT: .LBB81_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB81_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB81_11 ; RV64ZVE32F-NEXT: .LBB81_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -8879,68 +8907,66 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB82_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: .LBB82_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB82_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB82_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB82_14 -; RV64ZVE32F-NEXT: .LBB82_8: # %else8 +; RV64ZVE32F-NEXT: .LBB82_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_10 -; RV64ZVE32F-NEXT: .LBB82_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_9 +; RV64ZVE32F-NEXT: .LBB82_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB82_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB82_9: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB82_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB82_16 -; RV64ZVE32F-NEXT: .LBB82_12: # %else14 +; RV64ZVE32F-NEXT: .LBB82_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB82_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 
+; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_6 ; RV64ZVE32F-NEXT: .LBB82_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB82_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB82_7 ; RV64ZVE32F-NEXT: .LBB82_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB82_9 -; RV64ZVE32F-NEXT: j .LBB82_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB82_8 +; RV64ZVE32F-NEXT: j .LBB82_9 ; RV64ZVE32F-NEXT: .LBB82_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB82_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB82_11 ; RV64ZVE32F-NEXT: .LBB82_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -9084,65 +9110,63 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB83_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: andi a2, a2, 255 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: .LBB83_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB83_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB83_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB83_14 -; RV64ZVE32F-NEXT: .LBB83_8: # %else8 +; RV64ZVE32F-NEXT: .LBB83_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_10 -; RV64ZVE32F-NEXT: .LBB83_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_9 +; RV64ZVE32F-NEXT: .LBB83_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB83_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVE32F-NEXT: .LBB83_9: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB83_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; 
RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB83_16 -; RV64ZVE32F-NEXT: .LBB83_12: # %else14 +; RV64ZVE32F-NEXT: .LBB83_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB83_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: andi a2, a2, 255 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_6 ; RV64ZVE32F-NEXT: .LBB83_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB83_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB83_7 ; RV64ZVE32F-NEXT: .LBB83_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: andi a2, a2, 255 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB83_9 -; RV64ZVE32F-NEXT: j .LBB83_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB83_8 +; RV64ZVE32F-NEXT: j .LBB83_9 ; RV64ZVE32F-NEXT: .LBB83_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: andi a2, a2, 255 @@ -9150,7 +9174,7 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB83_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB83_11 ; RV64ZVE32F-NEXT: .LBB83_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -9294,68 +9318,66 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB84_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: .LBB84_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB84_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_14 -; RV64ZVE32F-NEXT: .LBB84_8: # %else8 +; RV64ZVE32F-NEXT: .LBB84_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_10 -; RV64ZVE32F-NEXT: .LBB84_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_9 +; RV64ZVE32F-NEXT: .LBB84_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd 
fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB84_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB84_9: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB84_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB84_16 -; RV64ZVE32F-NEXT: .LBB84_12: # %else14 +; RV64ZVE32F-NEXT: .LBB84_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB84_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_6 ; RV64ZVE32F-NEXT: .LBB84_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB84_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB84_7 ; RV64ZVE32F-NEXT: .LBB84_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB84_9 -; RV64ZVE32F-NEXT: j .LBB84_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB84_8 +; RV64ZVE32F-NEXT: j .LBB84_9 ; RV64ZVE32F-NEXT: .LBB84_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB84_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB84_11 ; RV64ZVE32F-NEXT: .LBB84_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -9497,68 +9519,66 @@ ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa1, 0(a2) ; RV64ZVE32F-NEXT: .LBB85_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 -; RV64ZVE32F-NEXT: slli a2, a2, 3 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: fsd fa2, 0(a2) -; RV64ZVE32F-NEXT: .LBB85_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB85_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB85_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB85_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 ; RV64ZVE32F-NEXT: bnez a2, .LBB85_14 -; RV64ZVE32F-NEXT: .LBB85_8: # %else8 +; RV64ZVE32F-NEXT: .LBB85_7: # %else8 ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_10 -; RV64ZVE32F-NEXT: .LBB85_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, 
v9 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_9 +; RV64ZVE32F-NEXT: .LBB85_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa5, 0(a2) -; RV64ZVE32F-NEXT: .LBB85_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB85_9: # %else10 ; RV64ZVE32F-NEXT: andi a2, a1, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a2, .LBB85_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a1, a1, -128 ; RV64ZVE32F-NEXT: bnez a1, .LBB85_16 -; RV64ZVE32F-NEXT: .LBB85_12: # %else14 +; RV64ZVE32F-NEXT: .LBB85_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB85_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: slli a2, a2, 3 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: fsd fa2, 0(a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_6 ; RV64ZVE32F-NEXT: .LBB85_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa3, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB85_8 +; RV64ZVE32F-NEXT: beqz a2, .LBB85_7 ; RV64ZVE32F-NEXT: .LBB85_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v8 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa4, 0(a2) ; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB85_9 -; RV64ZVE32F-NEXT: j .LBB85_10 +; RV64ZVE32F-NEXT: bnez a2, .LBB85_8 +; RV64ZVE32F-NEXT: j .LBB85_9 ; RV64ZVE32F-NEXT: .LBB85_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 ; RV64ZVE32F-NEXT: slli a2, a2, 3 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: fsd fa6, 0(a2) ; RV64ZVE32F-NEXT: andi a1, a1, -128 -; RV64ZVE32F-NEXT: beqz a1, .LBB85_12 +; RV64ZVE32F-NEXT: beqz a1, .LBB85_11 ; RV64ZVE32F-NEXT: .LBB85_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v8 @@ -9705,65 +9725,63 @@ ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa1, 0(a3) ; RV64ZVE32F-NEXT: .LBB86_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma ; RV64ZVE32F-NEXT: andi a3, a2, 4 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; RV64ZVE32F-NEXT: beqz a3, .LBB86_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 -; RV64ZVE32F-NEXT: and a3, a3, a1 -; RV64ZVE32F-NEXT: slli a3, a3, 3 -; RV64ZVE32F-NEXT: add a3, a0, a3 -; RV64ZVE32F-NEXT: fsd fa2, 0(a3) -; RV64ZVE32F-NEXT: .LBB86_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_12 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a3, a2, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4 ; RV64ZVE32F-NEXT: bnez a3, .LBB86_13 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB86_6: # %else6 ; RV64ZVE32F-NEXT: andi a3, a2, 16 ; RV64ZVE32F-NEXT: bnez a3, .LBB86_14 -; RV64ZVE32F-NEXT: .LBB86_8: # 
%else8 +; RV64ZVE32F-NEXT: .LBB86_7: # %else8 ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: beqz a3, .LBB86_10 -; RV64ZVE32F-NEXT: .LBB86_9: # %cond.store9 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_9 +; RV64ZVE32F-NEXT: .LBB86_8: # %cond.store9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa5, 0(a3) -; RV64ZVE32F-NEXT: .LBB86_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; RV64ZVE32F-NEXT: .LBB86_9: # %else10 ; RV64ZVE32F-NEXT: andi a3, a2, 64 -; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2 ; RV64ZVE32F-NEXT: bnez a3, .LBB86_15 -; RV64ZVE32F-NEXT: # %bb.11: # %else12 +; RV64ZVE32F-NEXT: # %bb.10: # %else12 ; RV64ZVE32F-NEXT: andi a2, a2, -128 ; RV64ZVE32F-NEXT: bnez a2, .LBB86_16 -; RV64ZVE32F-NEXT: .LBB86_12: # %else14 +; RV64ZVE32F-NEXT: .LBB86_11: # %else14 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB86_12: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: and a3, a3, a1 +; RV64ZVE32F-NEXT: slli a3, a3, 3 +; RV64ZVE32F-NEXT: add a3, a0, a3 +; RV64ZVE32F-NEXT: fsd fa2, 0(a3) +; RV64ZVE32F-NEXT: andi a3, a2, 8 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_6 ; RV64ZVE32F-NEXT: .LBB86_13: # %cond.store5 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a3, v9 +; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa3, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 16 -; RV64ZVE32F-NEXT: beqz a3, .LBB86_8 +; RV64ZVE32F-NEXT: beqz a3, .LBB86_7 ; RV64ZVE32F-NEXT: .LBB86_14: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a3, v8 +; RV64ZVE32F-NEXT: vmv.x.s a3, v9 ; RV64ZVE32F-NEXT: and a3, a3, a1 ; RV64ZVE32F-NEXT: slli a3, a3, 3 ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa4, 0(a3) ; RV64ZVE32F-NEXT: andi a3, a2, 32 -; RV64ZVE32F-NEXT: bnez a3, .LBB86_9 -; RV64ZVE32F-NEXT: j .LBB86_10 +; RV64ZVE32F-NEXT: bnez a3, .LBB86_8 +; RV64ZVE32F-NEXT: j .LBB86_9 ; RV64ZVE32F-NEXT: .LBB86_15: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a3, v8 ; RV64ZVE32F-NEXT: and a3, a3, a1 @@ -9771,7 +9789,7 @@ ; RV64ZVE32F-NEXT: add a3, a0, a3 ; RV64ZVE32F-NEXT: fsd fa6, 0(a3) ; RV64ZVE32F-NEXT: andi a2, a2, -128 -; RV64ZVE32F-NEXT: beqz a2, .LBB86_12 +; RV64ZVE32F-NEXT: beqz a2, .LBB86_11 ; RV64ZVE32F-NEXT: .LBB86_16: # %cond.store13 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v8 @@ -10645,28 +10663,30 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB91_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; 
RV64ZVE32F-NEXT: .LBB91_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: bnez a2, .LBB91_25 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB91_26 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB91_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_27 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_8 +; RV64ZVE32F-NEXT: .LBB91_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB91_8: # %else8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8 ; RV64ZVE32F-NEXT: beqz a2, .LBB91_10 -; RV64ZVE32F-NEXT: .LBB91_9: # %cond.store9 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -10675,18 +10695,16 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: .LBB91_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_28 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_27 ; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_29 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_28 ; RV64ZVE32F-NEXT: .LBB91_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_30 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_29 ; RV64ZVE32F-NEXT: .LBB91_13: # %else16 ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: beqz a2, .LBB91_15 @@ -10699,51 +10717,45 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 9 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: .LBB91_15: # %else18 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_17 -; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: .LBB91_17: # %else20 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_30 +; RV64ZVE32F-NEXT: # %bb.16: # %else20 ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4 ; RV64ZVE32F-NEXT: bltz a2, .LBB91_31 -; RV64ZVE32F-NEXT: # %bb.18: # %else22 +; RV64ZVE32F-NEXT: .LBB91_17: # %else22 ; RV64ZVE32F-NEXT: slli a2, a1, 51 ; RV64ZVE32F-NEXT: bltz a2, .LBB91_32 -; RV64ZVE32F-NEXT: .LBB91_19: # %else24 +; RV64ZVE32F-NEXT: .LBB91_18: # %else24 ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bgez a2, .LBB91_21 -; RV64ZVE32F-NEXT: .LBB91_20: # %cond.store25 +; RV64ZVE32F-NEXT: bgez a2, .LBB91_20 +; RV64ZVE32F-NEXT: .LBB91_19: # %cond.store25 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: 
vslidedown.vi v10, v9, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 13 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB91_21: # %else26 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 13 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) +; RV64ZVE32F-NEXT: .LBB91_20: # %else26 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 49 -; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB91_23 -; RV64ZVE32F-NEXT: # %bb.22: # %cond.store27 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB91_22 +; RV64ZVE32F-NEXT: # %bb.21: # %cond.store27 ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 14 ; RV64ZVE32F-NEXT: vse8.v v10, (a2) -; RV64ZVE32F-NEXT: .LBB91_23: # %else28 +; RV64ZVE32F-NEXT: .LBB91_22: # %else28 ; RV64ZVE32F-NEXT: lui a2, 1048568 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB91_25 -; RV64ZVE32F-NEXT: # %bb.24: # %cond.store29 +; RV64ZVE32F-NEXT: beqz a1, .LBB91_24 +; RV64ZVE32F-NEXT: # %bb.23: # %cond.store29 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v9 @@ -10751,8 +10763,16 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 15 ; RV64ZVE32F-NEXT: vse8.v v8, (a0) -; RV64ZVE32F-NEXT: .LBB91_25: # %else30 +; RV64ZVE32F-NEXT: .LBB91_24: # %else30 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB91_25: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB91_6 ; RV64ZVE32F-NEXT: .LBB91_26: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 @@ -10762,17 +10782,9 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3 ; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, .LBB91_8 -; RV64ZVE32F-NEXT: .LBB91_27: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4 -; RV64ZVE32F-NEXT: vse8.v v11, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB91_9 -; RV64ZVE32F-NEXT: j .LBB91_10 -; RV64ZVE32F-NEXT: .LBB91_28: # %cond.store11 +; RV64ZVE32F-NEXT: bnez a2, .LBB91_7 +; RV64ZVE32F-NEXT: j .LBB91_8 +; RV64ZVE32F-NEXT: .LBB91_27: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma @@ -10780,7 +10792,7 @@ ; RV64ZVE32F-NEXT: vse8.v v11, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: beqz a2, .LBB91_12 -; RV64ZVE32F-NEXT: .LBB91_29: # %cond.store13 +; RV64ZVE32F-NEXT: .LBB91_28: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 @@ -10790,7 +10802,7 @@ ; RV64ZVE32F-NEXT: vse8.v v10, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 256 ; RV64ZVE32F-NEXT: beqz a2, .LBB91_13 -; RV64ZVE32F-NEXT: .LBB91_30: # 
%cond.store15 +; RV64ZVE32F-NEXT: .LBB91_29: # %cond.store15 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -10799,25 +10811,33 @@ ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: bnez a2, .LBB91_14 ; RV64ZVE32F-NEXT: j .LBB91_15 +; RV64ZVE32F-NEXT: .LBB91_30: # %cond.store19 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10 +; RV64ZVE32F-NEXT: vse8.v v11, (a2) +; RV64ZVE32F-NEXT: slli a2, a1, 52 +; RV64ZVE32F-NEXT: bgez a2, .LBB91_17 ; RV64ZVE32F-NEXT: .LBB91_31: # %cond.store21 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v9 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 11 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 11 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 51 -; RV64ZVE32F-NEXT: bgez a2, .LBB91_19 +; RV64ZVE32F-NEXT: bgez a2, .LBB91_18 ; RV64ZVE32F-NEXT: .LBB91_32: # %cond.store23 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v9 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 12 -; RV64ZVE32F-NEXT: vse8.v v10, (a2) +; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 12 +; RV64ZVE32F-NEXT: vse8.v v9, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 50 -; RV64ZVE32F-NEXT: bltz a2, .LBB91_20 -; RV64ZVE32F-NEXT: j .LBB91_21 +; RV64ZVE32F-NEXT: bltz a2, .LBB91_19 +; RV64ZVE32F-NEXT: j .LBB91_20 %ptrs = getelementptr inbounds i8, ptr %base, <16 x i8> %idxs call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %val, <16 x ptr> %ptrs, i32 1, <16 x i1> %m) ret void @@ -10876,48 +10896,48 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB92_4: # %else2 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 4 ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 -; RV64ZVE32F-NEXT: beqz a2, .LBB92_6 -; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_6: # %else4 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: bnez a2, .LBB92_49 +; RV64ZVE32F-NEXT: # %bb.5: # %else4 ; RV64ZVE32F-NEXT: andi a2, a1, 8 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4 ; RV64ZVE32F-NEXT: bnez a2, .LBB92_50 -; RV64ZVE32F-NEXT: # %bb.7: # %else6 +; RV64ZVE32F-NEXT: .LBB92_6: # %else6 ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_51 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_8 +; RV64ZVE32F-NEXT: .LBB92_7: # %cond.store7 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 4 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB92_8: # %else8 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 32 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8 ; 
RV64ZVE32F-NEXT: beqz a2, .LBB92_10 -; RV64ZVE32F-NEXT: .LBB92_9: # %cond.store9 +; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v14 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB92_10: # %else10 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 64 ; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_52 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_51 ; RV64ZVE32F-NEXT: # %bb.11: # %else12 ; RV64ZVE32F-NEXT: andi a2, a1, 128 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_53 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_52 ; RV64ZVE32F-NEXT: .LBB92_12: # %else14 ; RV64ZVE32F-NEXT: andi a2, a1, 256 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_54 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_53 ; RV64ZVE32F-NEXT: .LBB92_13: # %else16 ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: beqz a2, .LBB92_15 @@ -10930,25 +10950,25 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 9 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB92_15: # %else18 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: andi a2, a1, 1024 -; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2 ; RV64ZVE32F-NEXT: beqz a2, .LBB92_17 ; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 10 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: .LBB92_17: # %else20 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 52 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_19 ; RV64ZVE32F-NEXT: # %bb.18: # %cond.store21 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 11 @@ -10959,7 +10979,7 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 16 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_21 ; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 12 @@ -10969,7 +10989,7 @@ ; RV64ZVE32F-NEXT: bgez a2, .LBB92_23 ; RV64ZVE32F-NEXT: # %bb.22: # %cond.store25 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma @@ -10978,14 +10998,14 @@ ; RV64ZVE32F-NEXT: .LBB92_23: # %else26 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 49 -; RV64ZVE32F-NEXT: vslidedown.vi 
v11, v12, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_55 +; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 2 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_54 ; RV64ZVE32F-NEXT: # %bb.24: # %else28 ; RV64ZVE32F-NEXT: slli a2, a1, 48 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_56 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_55 ; RV64ZVE32F-NEXT: .LBB92_25: # %else30 ; RV64ZVE32F-NEXT: slli a2, a1, 47 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_57 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_56 ; RV64ZVE32F-NEXT: .LBB92_26: # %else32 ; RV64ZVE32F-NEXT: slli a2, a1, 46 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_28 @@ -10998,28 +11018,30 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 17 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB92_28: # %else34 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 45 ; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB92_30 -; RV64ZVE32F-NEXT: # %bb.29: # %cond.store35 -; RV64ZVE32F-NEXT: vmv.x.s a2, v12 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 18 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: .LBB92_30: # %else36 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: bltz a2, .LBB92_57 +; RV64ZVE32F-NEXT: # %bb.29: # %else36 ; RV64ZVE32F-NEXT: slli a2, a1, 44 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: bltz a2, .LBB92_58 -; RV64ZVE32F-NEXT: # %bb.31: # %else38 +; RV64ZVE32F-NEXT: .LBB92_30: # %else38 ; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_59 +; RV64ZVE32F-NEXT: bgez a2, .LBB92_32 +; RV64ZVE32F-NEXT: .LBB92_31: # %cond.store39 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 20 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB92_32: # %else40 +; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 42 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_34 -; RV64ZVE32F-NEXT: .LBB92_33: # %cond.store41 +; RV64ZVE32F-NEXT: # %bb.33: # %cond.store41 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 @@ -11028,18 +11050,16 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 21 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB92_34: # %else42 -; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 41 ; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 2 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_60 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_59 ; RV64ZVE32F-NEXT: # %bb.35: # %else44 ; RV64ZVE32F-NEXT: slli a2, a1, 40 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_61 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_60 ; RV64ZVE32F-NEXT: .LBB92_36: # %else46 ; RV64ZVE32F-NEXT: slli a2, a1, 39 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_62 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_61 ; RV64ZVE32F-NEXT: .LBB92_37: # %else48 ; RV64ZVE32F-NEXT: slli a2, a1, 38 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_39 @@ -11052,51 +11072,45 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 25 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: .LBB92_39: # %else50 +; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4 ; RV64ZVE32F-NEXT: vsetivli 
zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 37 -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB92_41 -; RV64ZVE32F-NEXT: # %bb.40: # %cond.store51 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_41: # %else52 -; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_62 +; RV64ZVE32F-NEXT: # %bb.40: # %else52 ; RV64ZVE32F-NEXT: slli a2, a1, 36 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4 ; RV64ZVE32F-NEXT: bltz a2, .LBB92_63 -; RV64ZVE32F-NEXT: # %bb.42: # %else54 +; RV64ZVE32F-NEXT: .LBB92_41: # %else54 ; RV64ZVE32F-NEXT: slli a2, a1, 35 ; RV64ZVE32F-NEXT: bltz a2, .LBB92_64 -; RV64ZVE32F-NEXT: .LBB92_43: # %else56 +; RV64ZVE32F-NEXT: .LBB92_42: # %else56 ; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bgez a2, .LBB92_45 -; RV64ZVE32F-NEXT: .LBB92_44: # %cond.store57 +; RV64ZVE32F-NEXT: bgez a2, .LBB92_44 +; RV64ZVE32F-NEXT: .LBB92_43: # %cond.store57 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 29 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_45: # %else58 +; RV64ZVE32F-NEXT: .LBB92_44: # %else58 ; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: slli a2, a1, 33 -; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2 -; RV64ZVE32F-NEXT: bgez a2, .LBB92_47 -; RV64ZVE32F-NEXT: # %bb.46: # %cond.store59 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2 +; RV64ZVE32F-NEXT: bgez a2, .LBB92_46 +; RV64ZVE32F-NEXT: # %bb.45: # %cond.store59 ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 30 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: .LBB92_47: # %else60 +; RV64ZVE32F-NEXT: .LBB92_46: # %else60 ; RV64ZVE32F-NEXT: lui a2, 524288 ; RV64ZVE32F-NEXT: and a1, a1, a2 -; RV64ZVE32F-NEXT: beqz a1, .LBB92_49 -; RV64ZVE32F-NEXT: # %bb.48: # %cond.store61 +; RV64ZVE32F-NEXT: beqz a1, .LBB92_48 +; RV64ZVE32F-NEXT: # %bb.47: # %cond.store61 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 ; RV64ZVE32F-NEXT: vmv.x.s a1, v10 @@ -11104,8 +11118,16 @@ ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 31 ; RV64ZVE32F-NEXT: vse8.v v8, (a0) -; RV64ZVE32F-NEXT: .LBB92_49: # %else62 +; RV64ZVE32F-NEXT: .LBB92_48: # %else62 ; RV64ZVE32F-NEXT: ret +; RV64ZVE32F-NEXT: .LBB92_49: # %cond.store3 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: andi a2, a1, 8 +; RV64ZVE32F-NEXT: beqz a2, .LBB92_6 ; RV64ZVE32F-NEXT: .LBB92_50: # %cond.store5 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 @@ -11115,17 +11137,9 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 3 ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 16 -; RV64ZVE32F-NEXT: beqz a2, 
.LBB92_8 -; RV64ZVE32F-NEXT: .LBB92_51: # %cond.store7 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v13 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 4 -; RV64ZVE32F-NEXT: vse8.v v14, (a2) -; RV64ZVE32F-NEXT: andi a2, a1, 32 -; RV64ZVE32F-NEXT: bnez a2, .LBB92_9 -; RV64ZVE32F-NEXT: j .LBB92_10 -; RV64ZVE32F-NEXT: .LBB92_52: # %cond.store11 +; RV64ZVE32F-NEXT: bnez a2, .LBB92_7 +; RV64ZVE32F-NEXT: j .LBB92_8 +; RV64ZVE32F-NEXT: .LBB92_51: # %cond.store11 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma @@ -11133,7 +11147,7 @@ ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 128 ; RV64ZVE32F-NEXT: beqz a2, .LBB92_12 -; RV64ZVE32F-NEXT: .LBB92_53: # %cond.store13 +; RV64ZVE32F-NEXT: .LBB92_52: # %cond.store13 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v13 @@ -11143,7 +11157,7 @@ ; RV64ZVE32F-NEXT: vse8.v v14, (a2) ; RV64ZVE32F-NEXT: andi a2, a1, 256 ; RV64ZVE32F-NEXT: beqz a2, .LBB92_13 -; RV64ZVE32F-NEXT: .LBB92_54: # %cond.store15 +; RV64ZVE32F-NEXT: .LBB92_53: # %cond.store15 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v12 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -11152,7 +11166,7 @@ ; RV64ZVE32F-NEXT: andi a2, a1, 512 ; RV64ZVE32F-NEXT: bnez a2, .LBB92_14 ; RV64ZVE32F-NEXT: j .LBB92_15 -; RV64ZVE32F-NEXT: .LBB92_55: # %cond.store27 +; RV64ZVE32F-NEXT: .LBB92_54: # %cond.store27 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma @@ -11160,7 +11174,7 @@ ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 48 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_25 -; RV64ZVE32F-NEXT: .LBB92_56: # %cond.store29 +; RV64ZVE32F-NEXT: .LBB92_55: # %cond.store29 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -11170,7 +11184,7 @@ ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 47 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_26 -; RV64ZVE32F-NEXT: .LBB92_57: # %cond.store31 +; RV64ZVE32F-NEXT: .LBB92_56: # %cond.store31 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -11179,6 +11193,14 @@ ; RV64ZVE32F-NEXT: slli a2, a1, 46 ; RV64ZVE32F-NEXT: bltz a2, .LBB92_27 ; RV64ZVE32F-NEXT: j .LBB92_28 +; RV64ZVE32F-NEXT: .LBB92_57: # %cond.store35 +; RV64ZVE32F-NEXT: vmv.x.s a2, v12 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 18 +; RV64ZVE32F-NEXT: vse8.v v14, (a2) +; RV64ZVE32F-NEXT: slli a2, a1, 44 +; RV64ZVE32F-NEXT: bgez a2, .LBB92_30 ; RV64ZVE32F-NEXT: .LBB92_58: # %cond.store37 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1 @@ -11188,17 +11210,9 @@ ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 19 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 43 -; RV64ZVE32F-NEXT: bgez a2, .LBB92_32 -; RV64ZVE32F-NEXT: .LBB92_59: # %cond.store39 -; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 -; RV64ZVE32F-NEXT: add a2, a0, a2 -; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 20 -; RV64ZVE32F-NEXT: vse8.v v12, (a2) -; RV64ZVE32F-NEXT: slli a2, a1, 42 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_33 -; RV64ZVE32F-NEXT: j 
.LBB92_34 -; RV64ZVE32F-NEXT: .LBB92_60: # %cond.store43 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_31 +; RV64ZVE32F-NEXT: j .LBB92_32 +; RV64ZVE32F-NEXT: .LBB92_59: # %cond.store43 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma @@ -11206,7 +11220,7 @@ ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 40 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_36 -; RV64ZVE32F-NEXT: .LBB92_61: # %cond.store45 +; RV64ZVE32F-NEXT: .LBB92_60: # %cond.store45 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 ; RV64ZVE32F-NEXT: vmv.x.s a2, v11 @@ -11216,7 +11230,7 @@ ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 39 ; RV64ZVE32F-NEXT: bgez a2, .LBB92_37 -; RV64ZVE32F-NEXT: .LBB92_62: # %cond.store47 +; RV64ZVE32F-NEXT: .LBB92_61: # %cond.store47 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 @@ -11225,25 +11239,33 @@ ; RV64ZVE32F-NEXT: slli a2, a1, 38 ; RV64ZVE32F-NEXT: bltz a2, .LBB92_38 ; RV64ZVE32F-NEXT: j .LBB92_39 +; RV64ZVE32F-NEXT: .LBB92_62: # %cond.store51 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: add a2, a0, a2 +; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma +; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26 +; RV64ZVE32F-NEXT: vse8.v v12, (a2) +; RV64ZVE32F-NEXT: slli a2, a1, 36 +; RV64ZVE32F-NEXT: bgez a2, .LBB92_41 ; RV64ZVE32F-NEXT: .LBB92_63: # %cond.store53 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1 -; RV64ZVE32F-NEXT: vmv.x.s a2, v11 +; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1 +; RV64ZVE32F-NEXT: vmv.x.s a2, v10 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 27 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 35 -; RV64ZVE32F-NEXT: bgez a2, .LBB92_43 +; RV64ZVE32F-NEXT: bgez a2, .LBB92_42 ; RV64ZVE32F-NEXT: .LBB92_64: # %cond.store55 ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma -; RV64ZVE32F-NEXT: vmv.x.s a2, v10 +; RV64ZVE32F-NEXT: vmv.x.s a2, v11 ; RV64ZVE32F-NEXT: add a2, a0, a2 ; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 28 ; RV64ZVE32F-NEXT: vse8.v v12, (a2) ; RV64ZVE32F-NEXT: slli a2, a1, 34 -; RV64ZVE32F-NEXT: bltz a2, .LBB92_44 -; RV64ZVE32F-NEXT: j .LBB92_45 +; RV64ZVE32F-NEXT: bltz a2, .LBB92_43 +; RV64ZVE32F-NEXT: j .LBB92_44 %ptrs = getelementptr inbounds i8, ptr %base, <32 x i8> %idxs call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %val, <32 x ptr> %ptrs, i32 1, <32 x i1> %m) ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -541,57 +541,6 @@ declare <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half>, <128 x half>, metadata, <128 x i1>, i32) define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: fcmp_oeq_vv_v128f16: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli 
a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a1, 64 -; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: addi a0, a2, -64 -; CHECK-NEXT: sltu a3, a2, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 -; CHECK-NEXT: vslidedown.vi v0, v0, 8 -; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v1, v16, v8, v0.t -; CHECK-NEXT: bltu a2, a1, .LBB43_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: .LBB43_2: -; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vi v16, v1, 8 -; CHECK-NEXT: vmv.v.v v0, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret %v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl) ret <128 x i1> %v } @@ -1160,44 +1109,45 @@ ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: bltu a2, a1, .LBB87_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: .LBB87_2: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v2, v8, v24, v0.t ; CHECK-NEXT: addi a0, a2, -16 ; CHECK-NEXT: sltu a1, a2, a0 ; CHECK-NEXT: addi a1, a1, -1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: vslidedown.vi v0, v0, 2 ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: vmfeq.vv v1, v16, v8, v0.t -; CHECK-NEXT: bltu a2, a0, .LBB87_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: .LBB87_2: -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: csrr 
a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t +; CHECK-NEXT: vmfeq.vv v24, v16, v8, v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v16, v1, 2 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vslideup.vi v2, v24, 2 +; CHECK-NEXT: vmv1r.v v0, v2 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll @@ -1315,57 +1315,109 @@ declare <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32>, <64 x i32>, metadata, <64 x i1>, i32) define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: icmp_eq_vv_v64i32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: addi a0, a2, -32 -; CHECK-NEXT: sltu a3, a2, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 -; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vmseq.vv v1, v16, v8, v0.t -; CHECK-NEXT: bltu a2, a1, .LBB99_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: .LBB99_2: -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v16, v1, 4 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; RV32-LABEL: icmp_eq_vv_v64i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: li a3, 32 +; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: 
addi a1, a1, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle32.v v24, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 4 +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: bltu a2, a3, .LBB99_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 32 +; RV32-NEXT: .LBB99_2: +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vmseq.vv v2, v8, v24, v0.t +; RV32-NEXT: addi a0, a2, -32 +; RV32-NEXT: sltu a1, a2, a0 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a1, a0 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vmseq.vv v24, v16, v8, v0.t +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vslideup.vi v2, v24, 4 +; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: icmp_eq_vv_v64i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a3, 32 +; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; RV64-NEXT: vle32.v v24, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vle32.v v24, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: vslidedown.vi v1, v0, 4 +; RV64-NEXT: bltu a2, a3, .LBB99_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 32 +; RV64-NEXT: .LBB99_2: +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vmseq.vv v2, v8, v24, v0.t +; RV64-NEXT: addi a0, a2, -32 +; RV64-NEXT: sltu a1, a2, a0 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: and a0, a1, a0 +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vmseq.vv v24, v16, v8, v0.t +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vslideup.vi v2, v24, 4 +; RV64-NEXT: vmv1r.v v0, v2 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret %v = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> %va, <64 x i32> %vb, metadata !"eq", <64 x i1> %m, i32 %evl) ret <64 x i1> %v } @@ -1373,26 +1425,26 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_v64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: addi a2, a1, -32 -; 
CHECK-NEXT: sltu a3, a1, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t -; CHECK-NEXT: bltu a1, a2, .LBB100_2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: bltu a1, a3, .LBB100_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: .LBB100_2: +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t +; CHECK-NEXT: addi a2, a1, -32 +; CHECK-NEXT: sltu a1, a1, a2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a2 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t +; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v16, v25, 4 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vslideup.vi v25, v8, 4 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i32> poison, i32 %b, i32 0 %vb = shufflevector <64 x i32> %elt.head, <64 x i32> poison, <64 x i32> zeroinitializer @@ -1403,26 +1455,26 @@ define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: icmp_eq_vx_swap_v64i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmv1r.v v24, v0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: addi a2, a1, -32 -; CHECK-NEXT: sltu a3, a1, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t -; CHECK-NEXT: bltu a1, a2, .LBB101_2 +; CHECK-NEXT: li a3, 32 +; CHECK-NEXT: vslidedown.vi v24, v0, 4 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: bltu a1, a3, .LBB101_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: .LBB101_2: +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t +; CHECK-NEXT: addi a2, a1, -32 +; CHECK-NEXT: sltu a1, a1, a2 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a1, a1, a2 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t +; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v16, v25, 4 -; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vslideup.vi v25, v8, 4 +; CHECK-NEXT: vmv1r.v v0, v25 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i32> poison, i32 %b, i32 0 %vb = shufflevector <64 x i32> %elt.head, <64 x i32> poison, <64 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -91,26 +91,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) 
-; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vfwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwadd.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwadd.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -192,24 +203,36 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vfwadd.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwadd.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwadd.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -375,10 +398,10 @@ ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v24, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfmv.v.f v8, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v16, v8 -; CHECK-NEXT: vfwadd.wv v8, v16, v24 -; CHECK-NEXT: vfwadd.wv v16, v16, v0 +; CHECK-NEXT: 
vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwadd.wv v16, v8, v0 +; CHECK-NEXT: vfwadd.wv v8, v8, v24 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -91,26 +91,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vfwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwmul.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwmul.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -192,24 +203,36 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vfwmul.vv v8, 
v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwmul.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwmul.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -371,17 +394,18 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v16, v16, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v24, v16 -; CHECK-NEXT: vfwcvt.f.f.v v16, v8 -; CHECK-NEXT: vfmv.v.f v8, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfmul.vv v8, v16, v0 ; CHECK-NEXT: vfmul.vv v16, v24, v0 +; CHECK-NEXT: vfmul.vv v8, v8, v0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll @@ -91,26 +91,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vfwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwsub.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwsub.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli 
a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -192,24 +203,36 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vfwsub.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vfwsub.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfwsub.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -371,17 +394,18 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v16, v16, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v24, v16 -; CHECK-NEXT: vfwcvt.f.f.v v16, v8 -; CHECK-NEXT: vfmv.v.f v8, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfsub.vv v8, v16, v0 ; CHECK-NEXT: vfsub.vv v16, v24, v0 +; CHECK-NEXT: vfsub.vv v8, v8, v0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -2468,31 +2468,32 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v32f64: ; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v1, v0 ; RV32-NEXT: vsetivli zero, 
16, e32, m4, ta, ma ; RV32-NEXT: vnsrl.wi v24, v16, 0 ; RV32-NEXT: vnsrl.wi v16, v8, 0 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vslideup.vi v16, v24, 16 -; RV32-NEXT: li a3, 16 -; RV32-NEXT: vsll.vi v16, v16, 3 -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a3, .LBB96_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a2, 16 -; RV32-NEXT: .LBB96_2: -; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t +; RV32-NEXT: vsll.vi v24, v16, 3 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 +; RV32-NEXT: vslidedown.vi v8, v24, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a1, a1, a2 -; RV32-NEXT: addi a1, a1, -1 -; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: sltu a3, a1, a2 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t +; RV32-NEXT: li a2, 16 +; RV32-NEXT: bltu a1, a2, .LBB96_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB96_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_v32f64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -250,25 +250,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +296,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; 
CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,23 +342,35 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwadd.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwadd.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -250,25 +250,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; 
CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +296,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,23 +342,35 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, 
ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwaddu.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwaddu.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -275,26 +275,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmul.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -312,26 +323,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # 
sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmul.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -349,24 +371,36 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmul.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmul.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -267,26 +267,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulsu.vv v8, v0, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -304,26 +315,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vl8r.v v16, (a0) 
# Unknown-size Folded Reload +; CHECK-NEXT: vwmulsu.vv v8, v0, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -341,24 +363,36 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmulsu.vv v8, v24, v16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulsu.vv v8, v0, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -251,26 +251,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, 
v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulu.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -288,26 +299,37 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulu.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -325,24 +347,36 @@ ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; 
CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwmulu.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwmulu.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -250,25 +250,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +296,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: 
vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,23 +342,35 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwsub.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsub.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -250,25 +250,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, 
sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -285,25 +296,36 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vle16.v v0, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v0, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v8, v16, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -320,23 +342,35 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) -; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vle32.v v0, (a1) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v0, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vmv4r.v v24, v8 ; CHECK-NEXT: vwsubu.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v 
v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vwsubu.vv v8, v16, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll --- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll @@ -258,16 +258,16 @@ ; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RV32-NEXT: vle16.v v16, (a0) ; RV32-NEXT: vmv2r.v v20, v10 -; RV32-NEXT: vrgather.vv v12, v8, v16 -; RV32-NEXT: vid.v v8 -; RV32-NEXT: vrsub.vi v8, v8, 15 +; RV32-NEXT: vmv2r.v v12, v8 +; RV32-NEXT: vrgather.vv v8, v12, v16 +; RV32-NEXT: vid.v v12 +; RV32-NEXT: vrsub.vi v12, v12, 15 ; RV32-NEXT: lui a0, 16 ; RV32-NEXT: addi a0, a0, -1 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.x v0, a0 ; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV32-NEXT: vrgather.vv v12, v20, v8, v0.t -; RV32-NEXT: vmv.v.v v8, v12 +; RV32-NEXT: vrgather.vv v8, v20, v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: v16i16_2: @@ -278,16 +278,16 @@ ; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; RV64-NEXT: vle16.v v16, (a0) ; RV64-NEXT: vmv2r.v v20, v10 -; RV64-NEXT: vrgather.vv v12, v8, v16 -; RV64-NEXT: vid.v v8 -; RV64-NEXT: vrsub.vi v8, v8, 15 +; RV64-NEXT: vmv2r.v v12, v8 +; RV64-NEXT: vrgather.vv v8, v12, v16 +; RV64-NEXT: vid.v v12 +; RV64-NEXT: vrsub.vi v12, v12, 15 ; RV64-NEXT: lui a0, 16 ; RV64-NEXT: addiw a0, a0, -1 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-NEXT: vmv.v.x v0, a0 ; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; RV64-NEXT: vrgather.vv v12, v20, v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 +; RV64-NEXT: vrgather.vv v8, v20, v12, v0.t ; RV64-NEXT: ret %v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> ret <32 x i16> %v32i16 diff --git a/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll b/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll --- a/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll @@ -159,8 +159,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a2 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %v, i32 0 %splat = shufflevector %elt.head, poison, zeroinitializer @@ -175,8 +177,10 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 +; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a2 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret @@ -196,6 +200,8 @@ ; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: 
vse32.v v8, (a0) +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vse32.v v8, (a1) ; CHECK-NEXT: ret @@ -213,9 +219,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 +; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a2 +; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a1) +; CHECK-NEXT: vse32.v v9, (a1) ; CHECK-NEXT: ret %elt.head = insertelement poison, i32 %v, i32 0 %splat = shufflevector %elt.head, poison, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -107,10 +107,10 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { ; CHECK-LABEL: vector_deinterleave_v2i64_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vrgather.vi v9, v8, 1 ; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t @@ -194,10 +194,10 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { ; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma ; CHECK-NEXT: vslidedown.vi v10, v8, 2 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; CHECK-NEXT: vrgather.vi v9, v8, 1 ; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll --- a/llvm/test/CodeGen/X86/pr33349.ll +++ b/llvm/test/CodeGen/X86/pr33349.ll @@ -10,20 +10,20 @@ ; KNL: # %bb.0: # %bb ; KNL-NEXT: vpslld $31, %xmm0, %xmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: kshiftrw $2, %k0, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k2 -; KNL-NEXT: kmovw %k1, %ecx +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: testb $1, %al ; KNL-NEXT: fld1 ; KNL-NEXT: fldz ; KNL-NEXT: fld %st(0) ; KNL-NEXT: fcmovne %st(2), %st -; KNL-NEXT: testb $1, %cl +; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: testb $1, %al ; KNL-NEXT: fld %st(1) ; KNL-NEXT: fcmovne %st(3), %st -; KNL-NEXT: kmovw %k2, %eax +; KNL-NEXT: kshiftrw $1, %k0, %k1 +; KNL-NEXT: kmovw %k1, %eax ; KNL-NEXT: testb $1, %al ; KNL-NEXT: fld %st(2) ; KNL-NEXT: fcmovne %st(4), %st @@ -35,10 +35,10 @@ ; KNL-NEXT: fxch %st(3) ; KNL-NEXT: fstpt (%rdi) ; KNL-NEXT: fxch %st(1) -; KNL-NEXT: fstpt 30(%rdi) +; KNL-NEXT: fstpt 10(%rdi) ; KNL-NEXT: fxch %st(1) +; KNL-NEXT: fstpt 30(%rdi) ; KNL-NEXT: fstpt 20(%rdi) -; KNL-NEXT: fstpt 10(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; @@ -46,20 +46,20 @@ ; SKX: # %bb.0: # %bb ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 ; SKX-NEXT: vpmovd2m %xmm0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %eax ; SKX-NEXT: kshiftrb $2, %k0, %k1 ; SKX-NEXT: kshiftrb $1, %k1, %k2 -; SKX-NEXT: kmovd %k1, %ecx +; SKX-NEXT: kmovd %k1, %eax ; SKX-NEXT: 
testb $1, %al ; SKX-NEXT: fld1 ; SKX-NEXT: fldz ; SKX-NEXT: fld %st(0) ; SKX-NEXT: fcmovne %st(2), %st -; SKX-NEXT: testb $1, %cl +; SKX-NEXT: kmovd %k2, %eax +; SKX-NEXT: testb $1, %al ; SKX-NEXT: fld %st(1) ; SKX-NEXT: fcmovne %st(3), %st -; SKX-NEXT: kmovd %k2, %eax +; SKX-NEXT: kshiftrb $1, %k0, %k1 +; SKX-NEXT: kmovd %k1, %eax ; SKX-NEXT: testb $1, %al ; SKX-NEXT: fld %st(2) ; SKX-NEXT: fcmovne %st(4), %st @@ -71,10 +71,10 @@ ; SKX-NEXT: fxch %st(3) ; SKX-NEXT: fstpt (%rdi) ; SKX-NEXT: fxch %st(1) -; SKX-NEXT: fstpt 30(%rdi) +; SKX-NEXT: fstpt 10(%rdi) ; SKX-NEXT: fxch %st(1) +; SKX-NEXT: fstpt 30(%rdi) ; SKX-NEXT: fstpt 20(%rdi) -; SKX-NEXT: fstpt 10(%rdi) ; SKX-NEXT: retq bb: %tmp = select <4 x i1> %m, <4 x x86_fp80> , <4 x x86_fp80> zeroinitializer diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll --- a/llvm/test/CodeGen/X86/pr34177.ll +++ b/llvm/test/CodeGen/X86/pr34177.ll @@ -49,20 +49,20 @@ ; AVX512VL-LABEL: test: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 -; AVX512VL-NEXT: kshiftrb $1, %k0, %k2 +; AVX512VL-NEXT: kshiftrb $1, %k0, %k1 +; AVX512VL-NEXT: kshiftrb $2, %k0, %k2 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fld1 ; AVX512VL-NEXT: fldz ; AVX512VL-NEXT: fld %st(0) ; AVX512VL-NEXT: fcmovne %st(2), %st -; AVX512VL-NEXT: kshiftrb $1, %k1, %k0 -; AVX512VL-NEXT: kmovd %k0, %eax +; AVX512VL-NEXT: kmovd %k1, %eax ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fld %st(1) ; AVX512VL-NEXT: fcmovne %st(3), %st -; AVX512VL-NEXT: kmovd %k1, %eax +; AVX512VL-NEXT: kshiftrb $1, %k2, %k0 +; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fld %st(2) ; AVX512VL-NEXT: fcmovne %st(4), %st @@ -83,11 +83,11 @@ ; AVX512VL-NEXT: fadd %st, %st(0) ; AVX512VL-NEXT: fstpt (%rdi) ; AVX512VL-NEXT: fadd %st, %st(0) +; AVX512VL-NEXT: fstpt 20(%rdi) +; AVX512VL-NEXT: fadd %st, %st(0) ; AVX512VL-NEXT: fstpt 60(%rdi) ; AVX512VL-NEXT: fadd %st, %st(0) ; AVX512VL-NEXT: fstpt 40(%rdi) -; AVX512VL-NEXT: fadd %st, %st(0) -; AVX512VL-NEXT: fstpt 20(%rdi) %1 = icmp eq <4 x i64> , %a %2 = select <4 x i1> %1, <4 x x86_fp80> , <4 x x86_fp80> zeroinitializer %3 = fadd <4 x x86_fp80> %2, %2 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -64,7 +64,7 @@ ; ; AVX512DQ-LABEL: mask_replication_factor2_vf4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX512DQ-NEXT: vpmovd2m %ymm0, %k1 @@ -75,7 +75,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor2_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0 @@ -496,7 +496,7 @@ ; ; AVX512DQ-LABEL: mask_replication_factor3_vf2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u> ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 @@ -513,7 +513,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor3_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u> @@ -572,7 +572,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor3_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u> ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 @@ -840,29 +840,31 @@ ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 ; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k0, %k2 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: kmovw (%rdi), %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k3 +; AVX512BW-NEXT: kmovq %k4, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-9, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovq %k3, %k4 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-17, %ax -; AVX512BW-NEXT: kmovd %eax, %k5 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax @@ -915,22 +917,22 @@ ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k2 +; AVX512BW-NEXT: kshiftrd $4, %k0, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k7, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $5, %k0, %k2 @@ 
-940,142 +942,141 @@ ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 ; AVX512BW-NEXT: kshiftrd $26, %k0, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovq %k6, %k2 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k7, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 +; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $28, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $29, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $29, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload 
-; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $30, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $31, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k7, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $21, %k0, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k6 +; AVX512BW-NEXT: kandw %k2, %k1, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $22, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $22, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $23, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 +; AVX512BW-NEXT: kshiftrd $23, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $24, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $25, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 @@ -1085,63 +1086,63 @@ ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 -; AVX512BW-NEXT: korw 
%k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $18, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1151,128 +1152,128 @@ ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $11, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $10, %k0, %k4 -; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $10, %k0, %k5 +; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $12, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $12, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $13, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $13, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; 
AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $15, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $14, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $6, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $7, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $7, %k0, %k5 
+; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 +; AVX512BW-NEXT: kshiftrd $8, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrd $9, %k0, %k0 ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 @@ -1446,18 +1447,19 @@ ; AVX512BW-NEXT: kmovq (%rdi), %k0 ; AVX512BW-NEXT: kshiftrq $1, %k0, %k1 ; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k0, %k2 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: kmovw (%rdi), %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k3 +; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k4 ; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-9, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -1473,8 +1475,9 @@ ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: 
kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovq %k3, %k5 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: movw $-65, %ax @@ -1487,9 +1490,8 @@ ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-129, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovq %k3, %k5 ; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF @@ -1547,47 +1549,47 @@ ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $59, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrq $58, %k0, %k1 -; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrq $58, %k0, %k2 +; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k7 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $61, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k7 ; 
AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -1596,12 +1598,12 @@ ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k6, %k1, %k1 @@ -1609,8 +1611,8 @@ ; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1618,142 +1620,143 @@ ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $53, %k0, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k6 +; AVX512BW-NEXT: kandw %k3, %k1, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $54, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $55, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte 
Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $56, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $57, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k0, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $48, %k0, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $49, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $50, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $51, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $52, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, 
%k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $43, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 @@ -1763,49 +1766,48 @@ ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $44, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $45, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, 
%k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $46, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -1817,7 +1819,8 @@ ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -1825,8 +1828,7 @@ ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $37, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kandw %k2, %k1, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 @@ -1837,48 +1839,48 @@ ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $39, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $40, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: 
kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $41, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -1905,19 +1907,20 @@ ; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $33, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $34, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 @@ -1927,8 +1930,7 @@ ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -1937,14 +1939,14 @@ ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $36, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 @@ -1954,8 +1956,8 @@ ; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 @@ -1964,102 +1966,102 @@ ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $27, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 ; AVX512BW-NEXT: kshiftrq $26, %k0, %k3 ; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $28, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $29, %k0, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $30, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $31, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $21, %k0, %k1 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $21, %k0, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $22, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k6 ; AVX512BW-NEXT: korw %k6, 
%k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $23, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -2072,104 +2074,105 @@ ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrq $25, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k6 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k2} {z} -; AVX512BW-NEXT: kshiftrq $16, %k0, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $16, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512BW-NEXT: korw 
%k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $17, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $18, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $19, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $20, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 -; 
AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $11, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 @@ -2179,29 +2182,28 @@ ; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $12, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $13, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k6 @@ -2210,7 +2212,8 @@ ; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -2219,12 +2222,12 @@ ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, 
%k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 @@ -2241,11 +2244,11 @@ ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z} ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $6, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 @@ -2269,8 +2272,7 @@ ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -2279,8 +2281,7 @@ ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -2288,11 +2289,13 @@ ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrq $9, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -2358,7 +2361,7 @@ ; ; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2: ; AVX512DQ-SLOW: # %bb.0: -; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] @@ -2370,7 +2373,7 @@ ; ; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0 +; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] ; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 @@ -2382,7 +2385,7 @@ ; ; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-SLOW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512BW-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0 @@ -2395,7 +2398,7 @@ ; ; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2: ; AVX512BW-FAST: # %bb.0: -; 
AVX512BW-FAST-NEXT: kmovq (%rdi), %k1 +; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] @@ -2408,7 +2411,7 @@ ; ; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2: ; AVX512VBMI-SLOW: # %bb.0: -; AVX512VBMI-SLOW-NEXT: kmovq (%rdi), %k1 +; AVX512VBMI-SLOW-NEXT: kmovw (%rdi), %k1 ; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VBMI-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0 @@ -2421,7 +2424,7 @@ ; ; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2: ; AVX512VBMI-FAST: # %bb.0: -; AVX512VBMI-FAST-NEXT: kmovq (%rdi), %k1 +; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1 ; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1] @@ -2467,7 +2470,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor4_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 @@ -3151,7 +3154,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor5_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 @@ -3215,7 +3218,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor5_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 @@ -3582,12 +3585,14 @@ ; ; AVX512BW-LABEL: mask_replication_factor5_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k0 -; AVX512BW-NEXT: kshiftrd $1, %k0, %k1 +; AVX512BW-NEXT: kmovd (%rdi), %k5 +; AVX512BW-NEXT: kshiftrd $1, %k5, %k1 ; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k0, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovw (%rdi), %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-5, %ax @@ -3634,23 +3639,23 @@ ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k3 -; AVX512BW-NEXT: kshiftrd $2, %k0, %k1 +; AVX512BW-NEXT: kshiftrd $2, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 ; 
AVX512BW-NEXT: kshiftrw $5, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512BW-NEXT: kmovd %eax, %k4 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovd %eax, %k7 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $4, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF @@ -3666,29 +3671,29 @@ ; AVX512BW-NEXT: kshiftrw $2, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovd %eax, %k6 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $3, %k0, %k2 +; AVX512BW-NEXT: kshiftrd $3, %k5, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $29, %k0, %k1 +; AVX512BW-NEXT: kshiftrd $29, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 +; AVX512BW-NEXT: kshiftrd $28, %k5, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k3 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k3 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -3705,7 +3710,7 @@ ; AVX512BW-NEXT: korw %k2, %k3, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $30, %k0, %k3 +; AVX512BW-NEXT: kshiftrd $30, %k5, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 @@ -3717,16 +3722,16 @@ ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $31, %k0, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $31, %k5, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 ; 
AVX512BW-NEXT: kshiftrw $4, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 @@ -3738,98 +3743,96 @@ ; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 ; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z} -; AVX512BW-NEXT: kshiftrd $25, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $25, %k5, %k3 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k2, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k5, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $26, %k0, %k7 -; AVX512BW-NEXT: kmovq %k0, %k4 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: korw %k7, %k4, %k4 +; AVX512BW-NEXT: kandw %k0, %k4, %k4 +; AVX512BW-NEXT: kshiftrd $26, %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $27, %k4, %k6 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kshiftrd $27, %k5, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 
2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 -; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $22, %k0, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $22, %k5, %k0 +; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k7 +; AVX512BW-NEXT: kandw %k1, %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $13, %k3, %k7 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrd $23, %k0, %k7 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftrd $23, %k5, %k7 +; AVX512BW-NEXT: kmovq %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 @@ -3849,7 +3852,8 @@ ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $24, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; 
AVX512BW-NEXT: kshiftrw $7, %k6, %k7 @@ -3858,11 +3862,10 @@ ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 @@ -3875,76 +3878,77 @@ ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k3 +; AVX512BW-NEXT: korw %k3, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: korw %k7, %k3, %k2 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $19, %k0, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k4 +; AVX512BW-NEXT: kandw %k7, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k6 ; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $20, %k0, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; 
AVX512BW-NEXT: kshiftrw $8, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $21, %k0, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: korw %k6, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k4, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -3967,16 +3971,16 @@ ; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k3, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrd $17, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 @@ -4020,53 +4024,53 @@ ; AVX512BW-NEXT: kshiftrd $12, %k0, %k3 ; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k2 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw 
$13, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k3, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $15, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 +; AVX512BW-NEXT: kshiftrd $15, %k0, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -4075,17 +4079,17 @@ ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; 
AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $9, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $9, %k0, %k3 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 +; AVX512BW-NEXT: kandw %k1, %k3, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 @@ -4101,8 +4105,7 @@ ; AVX512BW-NEXT: kandw %k7, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4123,15 +4126,16 @@ ; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kandw %k3, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $15, %k7, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 @@ -4140,63 +4144,63 @@ ; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k3 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kshiftlw $14, %k7, %k2 +; AVX512BW-NEXT: korw %k2, %k4, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k2} {z} ; AVX512BW-NEXT: kshiftrd $6, %k0, %k4 ; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k4, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kmovw %k4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $14, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $7, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $8, %k0, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k5, %k5 +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -4204,69 +4208,69 @@ ; AVX512BW-NEXT: kshiftrw $3, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $4, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 +; AVX512BW-NEXT: kshiftrw $14, %k5, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k5, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k5, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $4, %k0, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $5, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; 
AVX512BW-NEXT: kshiftrw $7, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $5, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 @@ -4514,8 +4518,9 @@ ; AVX512BW-NEXT: movw $-3, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 +; AVX512BW-NEXT: kmovw (%rdi), %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 ; AVX512BW-NEXT: movw $-5, %ax @@ -4635,8 +4640,8 @@ ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4645,12 +4650,12 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -4694,12 +4699,11 @@ ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $8, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, 
%k6 @@ -4712,11 +4716,12 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4763,29 +4768,29 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $12, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -4827,15 +4832,15 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovq %k4, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4844,15 +4849,16 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -4890,32 +4896,32 @@ ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -4950,11 +4956,12 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -4963,23 +4970,22 @@ ; AVX512BW-NEXT: 
kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -4991,8 +4997,8 @@ ; AVX512BW-NEXT: kandw %k4, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5001,20 +5007,20 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5031,11 +5037,12 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5044,8 +5051,8 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5056,60 +5063,59 @@ ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, 
%k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $28, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5122,19 +5128,20 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5143,19 +5150,19 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5164,15 +5171,16 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5185,12 +5193,10 @@ ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -5207,16 +5213,16 @@ ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5233,11 +5239,12 @@ ; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5267,18 +5274,17 @@ ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; 
AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 @@ -5288,20 +5294,20 @@ ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -5336,7 +5342,8 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5345,7 +5352,8 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -5355,8 +5363,7 @@ ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5365,38 +5372,37 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 
2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5405,20 +5411,20 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5431,31 +5437,32 @@ ; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k6 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 +; AVX512BW-NEXT: kandw %k3, %k1, %k0 ; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5480,22 +5487,21 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $47, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5508,35 +5514,36 @@ ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; 
AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5545,12 +5552,12 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -5589,18 +5596,17 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 @@ -5609,15 +5615,16 @@ ; AVX512BW-NEXT: 
kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5644,8 +5651,8 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -5656,7 +5663,7 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5665,28 +5672,29 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; 
AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5707,49 +5715,49 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $59, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -5762,20 +5770,20 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; 
AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -5784,35 +5792,35 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k4 ; AVX512BW-NEXT: korw %k4, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 ; AVX512BW-NEXT: korw %k2, %k0, %k0 @@ -5884,7 +5892,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor6_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u> 
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 @@ -5989,7 +5997,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor6_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 @@ -6369,8 +6377,9 @@ ; AVX512BW-NEXT: movw $-3, %ax ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: kandw %k0, %k1, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-5, %ax @@ -6380,15 +6389,15 @@ ; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-9, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-17, %ax ; AVX512BW-NEXT: kmovd %eax, %k7 ; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: movw $-17, %ax +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-33, %ax @@ -6418,15 +6427,15 @@ ; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k6 -; AVX512BW-NEXT: kandw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF +; AVX512BW-NEXT: kmovd %eax, %k6 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF @@ -6468,107 +6477,107 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $30, %k5, %k3 -; AVX512BW-NEXT: 
kshiftlw $15, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k3, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k3, %k7 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrd $30, %k5, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k3, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k3, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k3, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k6, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k3 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k4 ; AVX512BW-NEXT: kshiftrd $31, %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $14, %k7, %k6 -; AVX512BW-NEXT: korw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, 
%k3, %k3 -; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $26, %k5, %k3 -; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k6 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 -; AVX512BW-NEXT: kshiftrd $27, %k5, %k7 -; AVX512BW-NEXT: kmovq %k5, %k3 +; AVX512BW-NEXT: kmovq %k5, %k1 +; AVX512BW-NEXT: kshiftrd $26, %k5, %k5 ; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k4 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 +; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k0, %k6, %k6 +; AVX512BW-NEXT: kshiftrd $27, %k1, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k7, %k7 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k7, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $28, %k3, %k6 +; AVX512BW-NEXT: kshiftrd $28, %k1, %k6 +; AVX512BW-NEXT: kmovq %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 @@ -6580,149 +6589,147 @@ ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k5, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $24, %k0, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: korw %k0, %k3, %k2 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kmovq %k4, %k0 +; AVX512BW-NEXT: kshiftrd $24, %k4, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k4 +; AVX512BW-NEXT: kandw %k4, %k2, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k4, %k4 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k4, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $25, %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 +; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $25, %k0, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: 
kshiftrw $8, %k4, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 -; AVX512BW-NEXT: kmovq %k0, %k1 -; AVX512BW-NEXT: korw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k5, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k5, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k5, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $21, %k2, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovq %k0, %k1 +; AVX512BW-NEXT: kshiftrd $21, %k0, %k3 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k4, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 -; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k5, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k4 +; 
AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k5, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k5, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $22, %k2, %k4 -; AVX512BW-NEXT: kmovq %k2, %k6 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $22, %k1, %k4 +; AVX512BW-NEXT: kmovq %k1, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k4 -; AVX512BW-NEXT: kshiftrd $23, %k6, %k5 -; AVX512BW-NEXT: kmovq %k6, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k4 +; AVX512BW-NEXT: kshiftrd $23, %k7, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 
+; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k4, %k4 @@ -6730,233 +6737,232 @@ ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z} +; AVX512BW-NEXT: korw %k2, %k4, %k2 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z} ; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftrd $18, %k7, %k6 ; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k3 -; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k2 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $19, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $9, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $20, %k4, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, 
%k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 +; AVX512BW-NEXT: korw %k3, %k5, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $16, %k0, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k2 +; AVX512BW-NEXT: kshiftrd $16, %k4, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $17, %k0, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $17, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k3, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k2, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovq %k4, %k0 +; AVX512BW-NEXT: kshiftrd $13, %k4, %k3 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 
-; AVX512BW-NEXT: kandw %k6, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k4 +; AVX512BW-NEXT: kandw %k1, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k4, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k3 ; AVX512BW-NEXT: kmovq %k0, %k7 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $11, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k4 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k3, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $6, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k3 ; AVX512BW-NEXT: kshiftrd $15, %k7, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: 
korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k4, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 +; AVX512BW-NEXT: korw %k6, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k3, %k3 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k4, %k4 -; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 -; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 -; AVX512BW-NEXT: korw %k3, %k4, %k3 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k3} {z} -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftrd $10, %k7, %k0 -; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $10, %k7, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k1, %k2, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k1 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrd $11, %k7, %k6 +; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 
2-byte Reload @@ -6967,135 +6973,137 @@ ; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $8, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrd $12, %k3, %k6 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kshiftrd $12, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k5, %k4 +; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: korw %k0, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kshiftrd $8, %k3, %k1 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftrd $8, %k2, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k2 +; AVX512BW-NEXT: kandw %k6, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $9, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 +; AVX512BW-NEXT: kshiftrd $9, %k2, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k4 +; AVX512BW-NEXT: kmovq %k2, %k5 +; AVX512BW-NEXT: korw %k4, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: 
kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $5, %k1, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k2, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 +; AVX512BW-NEXT: kshiftrd $5, %k1, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k6, %k4, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k7 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $6, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 ; AVX512BW-NEXT: kandw %k0, %k3, %k3 @@ -7123,8 +7131,8 @@ ; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k4, %k4 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovq %k2, %k6 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 @@ -7132,33 +7140,34 @@ ; AVX512BW-NEXT: korw %k3, %k4, %k3 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kshiftrw $14, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrd $3, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $12, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $11, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $10, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k4, %k5 ; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k4, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 @@ -7464,10 +7473,11 @@ ; AVX512BW-NEXT: movw $-3, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k5, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k3 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k0 +; AVX512BW-NEXT: korw %k0, %k3, %k0 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill @@ -7568,20 +7578,20 @@ ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte 
Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -7596,8 +7606,8 @@ ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -7794,8 +7804,8 @@ ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 @@ -7820,8 +7830,8 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -7849,18 +7859,18 @@ ; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $15, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovq %k4, %k3 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload @@ -7886,51 +7896,49 @@ ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $17, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k3 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $18, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -7947,17 +7955,16 @@ ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $19, %k3, %k1 +; AVX512BW-NEXT: kshiftrq $19, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -7974,31 +7981,31 @@ ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; 
AVX512BW-NEXT: kshiftrq $20, %k7, %k1 +; AVX512BW-NEXT: kshiftrq $20, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftrq $21, %k7, %k1 +; AVX512BW-NEXT: kshiftrq $21, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8014,63 +8021,61 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $22, %k4, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $22, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; 
AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $23, %k7, %k6 +; AVX512BW-NEXT: kshiftrq $23, %k5, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $24, %k5, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -8088,12 +8093,12 @@ ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $25, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8101,16 +8106,15 @@ ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 
2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload @@ -8119,7 +8123,7 @@ ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $26, %k7, %k1 +; AVX512BW-NEXT: kshiftrq $26, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 @@ -8140,8 +8144,8 @@ ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $27, %k7, %k1 +; AVX512BW-NEXT: kmovq %k5, %k7 +; AVX512BW-NEXT: kshiftrq $27, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8153,44 +8157,46 @@ ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $28, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $29, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8207,7 +8213,8 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 @@ -8216,43 +8223,42 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -8286,104 +8292,102 @@ 
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $33, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $34, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k4 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $34, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $35, %k4, %k1 +; AVX512BW-NEXT: kshiftrq $35, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, 
%k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $36, %k7, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $36, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq %k7, %k3 -; AVX512BW-NEXT: kshiftrq $37, %k7, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $37, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8391,30 +8395,29 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k4, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $38, %k3, %k1 +; AVX512BW-NEXT: kshiftrq $38, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8425,107 +8428,105 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $39, %k3, %k6 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $39, %k5, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z} -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $40, %k3, %k0 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $40, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $41, %k7, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $42, %k7, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $42, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 
# 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $43, %k7, %k1 +; AVX512BW-NEXT: kshiftrq $43, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8533,14 +8534,13 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8550,31 +8550,33 @@ ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $44, %k7, %k1 +; AVX512BW-NEXT: kshiftrq $44, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k1 +; AVX512BW-NEXT: kshiftrq $45, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8582,72 +8584,70 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $46, %k7, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $46, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $47, %k7, %k6 +; AVX512BW-NEXT: kshiftrq $47, %k5, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -8655,46 +8655,45 @@ ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $49, %k5, %k1 -; AVX512BW-NEXT: kmovq %k5, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $50, %k2, %k1 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $50, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 @@ -8702,8 +8701,8 @@ ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8716,17 +8715,16 @@ ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k7 -; AVX512BW-NEXT: kshiftrq $51, %k2, %k1 +; AVX512BW-NEXT: kshiftrq $51, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -8737,37 +8735,36 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $52, %k7, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $52, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, 
%k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $53, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k4 +; AVX512BW-NEXT: kshiftrq $53, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -8783,85 +8780,83 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kshiftrq $54, %k4, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $54, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $55, %k7, %k6 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $55, %k5, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; 
AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $56, %k5, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $57, %k5, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 @@ -8870,18 +8865,20 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw 
%k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8902,8 +8899,7 @@ ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8912,18 +8908,17 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 @@ -8937,22 +8932,24 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; 
AVX512BW-NEXT: kshiftrw $2, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -8991,40 +8988,40 @@ ; AVX512BW-NEXT: kandw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $63, %k5, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $63, %k5, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k4 -; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k2, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx) @@ -9098,7 +9095,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor7_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u> ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 @@ -9108,9 +9105,9 @@ ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; AVX512BW-NEXT: vmovdqa32 (%rsi), 
%zmm0 {%k1} {z} ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx) -; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512BW-NEXT: vmovq %xmm1, 48(%rdx) ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-NEXT: vmovq %xmm0, 48(%rdx) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 @@ -9167,7 +9164,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor7_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u> ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 @@ -9684,18 +9681,19 @@ ; ; AVX512BW-LABEL: mask_replication_factor7_vf32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd (%rdi), %k6 ; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kandw %k0, %k6, %k1 -; AVX512BW-NEXT: kmovq %k0, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovq %k2, %k6 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovq %k2, %k3 +; AVX512BW-NEXT: kmovq %k2, %k4 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 @@ -9727,20 +9725,22 @@ ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrd $1, %k6, %k0 +; AVX512BW-NEXT: kmovd (%rdi), %k3 +; AVX512BW-NEXT: kshiftrd $1, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovq %k2, %k7 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512BW-NEXT: kmovd %eax, %k7 -; AVX512BW-NEXT: kandw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF @@ -9771,7 +9771,7 @@ ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $2, %k6, %k2 +; AVX512BW-NEXT: kshiftrd $2, %k3, %k2 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kshiftlw $14, %k2, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 @@ -9781,362 +9781,357 @@ ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: kmovq %k6, %k2 -; AVX512BW-NEXT: kshiftrd $29, %k6, %k1 +; 
AVX512BW-NEXT: kshiftrd $29, %k3, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k4, %k6 -; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k4, %k1, %k0 +; AVX512BW-NEXT: kandw %k6, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $30, %k2, %k1 -; AVX512BW-NEXT: kmovq %k2, %k4 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k2 +; AVX512BW-NEXT: korw %k2, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k2 +; AVX512BW-NEXT: kshiftrd $30, %k3, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k2 +; AVX512BW-NEXT: kshiftrd $31, %k3, %k4 +; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kshiftlw $15, %k4, %k0 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k3 -; AVX512BW-NEXT: kshiftrd $31, %k4, %k0 -; 
AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k1, %k0, %k1 -; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $27, %k2, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k6, %k7 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k0, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k2} {z} +; AVX512BW-NEXT: kshiftrd $27, %k3, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k2, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k4 +; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k6, %k7 +; AVX512BW-NEXT: kandw %k6, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k7 -; AVX512BW-NEXT: kshiftrd $28, %k2, %k0 +; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k7 +; AVX512BW-NEXT: kshiftrd $28, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k7, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k6 +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 +; AVX512BW-NEXT: kmovq %k3, %k1 ; AVX512BW-NEXT: korw %k6, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k2, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: korw %k7, %k0, 
%k2 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: korw %k1, %k0, %k1 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $25, %k6, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k0 +; AVX512BW-NEXT: korw %k0, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k5 +; AVX512BW-NEXT: kshiftrd $26, %k6, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k2 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 +; AVX512BW-NEXT: korw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k2, %k4 +; AVX512BW-NEXT: korw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k6, %k0, %k2 +; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k2} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $23, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrd $22, %k2, %k4 +; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovq %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k4, %k2 ; AVX512BW-NEXT: kshiftrw $14, 
%k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $26, %k6, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k6, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $23, %k3, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $22, %k3, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k3, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k3 -; 
AVX512BW-NEXT: kshiftrw $14, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 -; AVX512BW-NEXT: kshiftrd $24, %k6, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k3, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k2 +; AVX512BW-NEXT: kshiftrd $24, %k6, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 ; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k0, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $20, %k3, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k5, %k6 -; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k6, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k0, %k1 +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $20, %k6, %k0 +; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 +; AVX512BW-NEXT: korw %k5, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $21, %k3, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 +; AVX512BW-NEXT: kandw %k0, %k1, %k5 +; AVX512BW-NEXT: kshiftrd $21, %k6, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k2, %k5 +; AVX512BW-NEXT: kandw %k0, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k5, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k7, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k7, %k1 +; AVX512BW-NEXT: kshiftrw $5, 
%k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k7, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k7, %k4 +; AVX512BW-NEXT: korw %k4, %k5, %k4 +; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $18, %k4, %k2 -; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k7 -; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k7, %k6 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $18, %k7, %k1 +; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $13, %k7, %k6 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k6 +; AVX512BW-NEXT: kmovq %k1, %k2 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $12, %k7, %k6 +; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k7, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $19, %k4, %k5 +; AVX512BW-NEXT: kandw %k3, %k5, %k6 +; AVX512BW-NEXT: kshiftrd $19, %k7, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 @@ -10144,98 +10139,99 @@ ; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; 
AVX512BW-NEXT: kshiftrw $9, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 +; AVX512BW-NEXT: kmovq %k2, %k7 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k7, %k3, %k3 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z} +; AVX512BW-NEXT: kshiftlw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k7, %k2, %k2 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $16, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k3 +; AVX512BW-NEXT: kandw %k4, %k0, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw 
$12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k3 +; AVX512BW-NEXT: korw %k0, %k2, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k2 ; AVX512BW-NEXT: kshiftrd $17, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k3, %k3 +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 -; AVX512BW-NEXT: korw %k0, %k3, %k0 +; AVX512BW-NEXT: korw %k0, %k2, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload @@ -10247,264 +10243,264 @@ ; AVX512BW-NEXT: korw %k1, %k0, %k1 ; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; 
AVX512BW-NEXT: kshiftrd $13, %k0, %k1 -; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kandw %k6, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrd $13, %k0, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k3 -; AVX512BW-NEXT: kshiftrd $14, %k0, %k2 -; AVX512BW-NEXT: kmovq %k0, %k1 +; AVX512BW-NEXT: kandw %k1, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k6, %k1, %k2 +; AVX512BW-NEXT: kshiftrd $14, %k0, %k1 +; AVX512BW-NEXT: kmovq %k0, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $8, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $7, %k1, %k1 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k2 +; AVX512BW-NEXT: kshiftrd $15, %k6, %k5 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k5 -; AVX512BW-NEXT: korw %k5, %k3, %k3 -; AVX512BW-NEXT: kandw %k4, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $15, %k1, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k3 -; AVX512BW-NEXT: kshiftrw $6, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; 
AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $3, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k3, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kandw %k0, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $2, %k1, %k6 +; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k5 +; AVX512BW-NEXT: korw %k5, %k2, %k2 ; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k2} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $11, %k3, %k6 +; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z} +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $11, %k2, %k6 ; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k7, %k5, %k6 -; AVX512BW-NEXT: kshiftrd $12, %k3, %k5 +; AVX512BW-NEXT: kshiftrd $12, %k2, %k5 ; AVX512BW-NEXT: kshiftlw $15, %k5, %k5 ; AVX512BW-NEXT: kshiftrw $11, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $10, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $9, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $8, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k5 ; AVX512BW-NEXT: korw %k5, %k6, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $4, %k7, %k6 +; AVX512BW-NEXT: kandw %k4, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k7, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload ; AVX512BW-NEXT: kandw %k0, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k7, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k4, %k6 +; AVX512BW-NEXT: kmovq %k4, %k0 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $14, %k0, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k1} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload +; AVX512BW-NEXT: kshiftlw $14, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k5, %k4 +; AVX512BW-NEXT: kshiftlw $1, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $1, %k4, %k4 +; AVX512BW-NEXT: korw %k0, %k4, %k4 +; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k4} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload ; AVX512BW-NEXT: kshiftrd $9, %k6, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k0, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: korw 
%k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k5 -; AVX512BW-NEXT: kshiftrd $10, %k6, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k6 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k1, %k4, %k5 +; AVX512BW-NEXT: kshiftrd $10, %k6, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k4, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k6 +; AVX512BW-NEXT: kandw %k3, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k5, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k1, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $3, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k5, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kshiftrw $2, %k1, %k5 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload ; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 -; AVX512BW-NEXT: korw %k4, %k1, %k1 +; AVX512BW-NEXT: korw %k2, %k4, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k1, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z} -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $7, %k4, %k1 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $7, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrd $6, %k4, %k5 -; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-NEXT: kmovq %k4, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftrd $6, %k3, %k2 +; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BW-NEXT: kandw %k7, %k2, %k4 ; AVX512BW-NEXT: kshiftrw $14, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $12, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $11, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $10, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload 
-; AVX512BW-NEXT: kandw %k2, %k1, %k2 -; AVX512BW-NEXT: kshiftrd $8, %k6, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k4 +; AVX512BW-NEXT: kshiftrd $8, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k5 ; AVX512BW-NEXT: kshiftrw $7, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $6, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k4, %k4 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $5, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $4, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $3, %k5, %k6 -; AVX512BW-NEXT: korw %k6, %k2, %k2 +; AVX512BW-NEXT: korw %k6, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k2, %k2 +; AVX512BW-NEXT: kandw %k6, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $2, %k5, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k2, %k2 +; AVX512BW-NEXT: kandw %k5, %k4, %k4 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k2, %k1 +; AVX512BW-NEXT: korw %k1, %k4, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 @@ -10513,120 +10509,120 @@ ; AVX512BW-NEXT: kshiftrd $4, %k6, %k1 ; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512BW-NEXT: kandw %k0, %k1, %k2 +; AVX512BW-NEXT: kandw %k0, %k1, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k4, %k4 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k2, %k2 +; AVX512BW-NEXT: korw %k5, %k4, %k4 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k2, %k5 -; AVX512BW-NEXT: kshiftrd $5, %k6, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k2, %k6 +; AVX512BW-NEXT: kandw %k1, %k4, %k5 +; AVX512BW-NEXT: kshiftrd $5, %k6, %k4 +; AVX512BW-NEXT: kshiftlw $15, %k4, %k4 +; AVX512BW-NEXT: kshiftrw $12, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $11, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $11, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, 
%k5, %k5 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $10, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $9, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $9, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $8, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $8, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $7, %k2, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $7, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kandw %k3, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512BW-NEXT: korw %k2, %k5, %k2 -; AVX512BW-NEXT: kandw %k4, %k2, %k5 +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $6, %k4, %k4 +; AVX512BW-NEXT: korw %k4, %k5, %k4 +; AVX512BW-NEXT: kandw %k3, %k4, %k5 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-NEXT: kshiftlw $15, %k3, %k2 -; AVX512BW-NEXT: kshiftrw $5, %k2, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k3, %k4 +; AVX512BW-NEXT: kshiftrw $5, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $4, %k2, %k6 -; AVX512BW-NEXT: korw %k6, %k5, %k5 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $3, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload ; AVX512BW-NEXT: kandw %k7, %k5, %k5 -; AVX512BW-NEXT: kshiftrw $2, %k2, %k6 +; AVX512BW-NEXT: kshiftrw $3, %k4, %k6 +; AVX512BW-NEXT: korw %k6, %k5, %k5 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k5, %k5 +; AVX512BW-NEXT: kshiftrw $2, %k4, %k6 ; AVX512BW-NEXT: korw %k6, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k5, %k5 -; AVX512BW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512BW-NEXT: korw %k3, %k5, %k3 -; AVX512BW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512BW-NEXT: kshiftrw $1, %k3, %k3 -; AVX512BW-NEXT: korw %k2, %k3, %k2 +; AVX512BW-NEXT: kshiftlw $14, %k3, %k2 +; AVX512BW-NEXT: korw %k2, %k5, %k2 +; AVX512BW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k2} {z} ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kshiftrw $14, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $13, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 
# 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $12, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $11, %k4, %k3 -; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k2, %k3 -; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512BW-NEXT: kshiftrd $3, %k2, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512BW-NEXT: kshiftrw $10, %k2, %k4 -; AVX512BW-NEXT: korw %k4, %k3, %k3 +; AVX512BW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512BW-NEXT: korw %k4, %k2, %k2 +; AVX512BW-NEXT: kandw %k1, %k2, %k4 +; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload +; AVX512BW-NEXT: kshiftrd $3, %k1, %k2 +; AVX512BW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512BW-NEXT: kshiftrw $10, %k2, %k3 +; AVX512BW-NEXT: korw %k3, %k4, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $9, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $8, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $7, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $6, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $5, %k2, %k4 ; AVX512BW-NEXT: korw %k4, %k3, %k3 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k3, %k3 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k3, %k3 ; AVX512BW-NEXT: kshiftrw $4, %k2, %k2 ; AVX512BW-NEXT: korw %k2, %k3, %k2 -; AVX512BW-NEXT: kandw %k1, %k2, %k2 +; AVX512BW-NEXT: kandw %k7, %k2, %k2 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 -; AVX512BW-NEXT: kandw %k7, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k2, %k2 ; AVX512BW-NEXT: kshiftrw 
$2, %k0, %k3 ; AVX512BW-NEXT: korw %k3, %k2, %k2 ; AVX512BW-NEXT: kandw %k6, %k2, %k2 @@ -10938,94 +10934,95 @@ ; ; AVX512BW-LABEL: mask_replication_factor7_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k4 ; AVX512BW-NEXT: movw $-3, %ax -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k0, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k4, %k1 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-5, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $13, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-9, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-17, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $11, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-33, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovq %k2, %k4 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $10, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-65, %ax ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: movw $-129, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $1, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $8, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq (%rdi), %k3 +; AVX512BW-NEXT: kshiftrq $1, %k3, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF ; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kmovq %k2, %k5 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kshiftrw $7, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, 
%k1 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $6, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $6, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k3 -; AVX512BW-NEXT: korw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k2 +; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %eax, %k5 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512BW-NEXT: kandw %k1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $2, %k4, %k1 +; AVX512BW-NEXT: kshiftrq $2, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11033,27 +11030,26 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k0, %k6 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k7, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k7, %k1 ; AVX512BW-NEXT: korw 
%k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq %k4, %k7 -; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: kshiftrq $3, %k4, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: kshiftrq $3, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11061,23 +11057,24 @@ ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -11086,26 +11083,26 @@ ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: kandw %k3, %k6, %k1 +; AVX512BW-NEXT: kandw %k2, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, 
%k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $5, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 @@ -11119,108 +11116,108 @@ ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $6, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 +; AVX512BW-NEXT: kshiftrq $6, %k7, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 +; AVX512BW-NEXT: kshiftlw $14, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k1 -; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: korw %k0, %k6, %k6 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $7, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $8, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $9, %k5, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload +; AVX512BW-NEXT: 
kshiftrq $9, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z} @@ -11232,89 +11229,91 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kmovq %k5, %k4 -; AVX512BW-NEXT: kshiftrq $10, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $10, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $11, %k4, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $11, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: 
vmovdqa32 256(%rsi), %zmm4 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $12, %k4, %k0 +; AVX512BW-NEXT: kmovq %k2, %k7 +; AVX512BW-NEXT: kshiftrq $12, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11322,55 +11321,54 @@ ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $13, %k7, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k1 +; AVX512BW-NEXT: 
vmovdqa32 320(%rsi), %zmm5 {%k1} {z} +; AVX512BW-NEXT: kandw %k5, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $13, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $14, %k4, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $14, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -11381,94 +11379,91 @@ ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $15, %k4, %k1 -; AVX512BW-NEXT: kmovq %k4, %k3 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $15, %k5, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; 
AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z} -; AVX512BW-NEXT: kmovq %k3, %k2 -; AVX512BW-NEXT: kshiftrq $16, %k3, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload ; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $17, %k2, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $17, %k5, %k0 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: 
kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k7, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $18, %k4, %k1 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $18, %k7, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -11480,27 +11475,29 @@ ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $19, %k4, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $19, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -11508,30 +11505,29 @@ ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $20, %k4, %k6 +; AVX512BW-NEXT: kshiftrq $20, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -11542,59 +11538,58 @@ ; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $21, %k4, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $21, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $22, %k4, %k0 +; AVX512BW-NEXT: kshiftrq $22, %k7, %k0 +; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 @@ -11603,52 +11598,54 @@ ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $23, %k4, %k0 +; AVX512BW-NEXT: kshiftrq $23, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; 
AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $24, %k4, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $24, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -11661,7 +11658,7 @@ ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $25, %k4, %k1 +; AVX512BW-NEXT: kshiftrq $25, %k2, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z} @@ -11673,137 +11670,140 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte 
Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $26, %k4, %k0 +; AVX512BW-NEXT: kmovq %k2, %k7 +; AVX512BW-NEXT: kshiftrq $26, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $27, %k4, %k6 +; AVX512BW-NEXT: kshiftrq $27, %k7, %k6 +; AVX512BW-NEXT: kmovq %k7, %k4 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload 
; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq %k4, %k7 ; AVX512BW-NEXT: kshiftrq $28, %k4, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $29, %k7, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $29, %k4, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 
-; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z} -; AVX512BW-NEXT: kandw %k2, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $30, %k4, %k0 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $30, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload @@ -11818,92 +11818,92 @@ ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $31, %k4, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $31, %k7, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $32, %k4, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $32, %k5, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $33, %k4, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kshiftrq $33, %k5, %k0 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 
2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 @@ -11920,49 +11920,48 @@ ; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k5, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kmovq %k3, %k7 ; AVX512BW-NEXT: kshiftrq $35, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $36, %k7, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 @@ -11971,8 +11970,8 @@ ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -11987,96 +11986,96 @@ ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $37, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k6 ; AVX512BW-NEXT: kshiftrq $38, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: 
kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $39, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $40, %k7, %k0 +; AVX512BW-NEXT: kmovq %k7, %k3 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -12084,36 +12083,37 @@ ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; 
AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $41, %k5, %k1 +; AVX512BW-NEXT: kshiftrq $41, %k3, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z} -; AVX512BW-NEXT: kandw %k3, %k1, %k0 +; AVX512BW-NEXT: kandw %k5, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -12124,45 +12124,42 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq %k5, %k7 -; AVX512BW-NEXT: kshiftrq $42, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $42, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: 
kshiftrw $3, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $43, %k7, %k6 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $43, %k3, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -12173,68 +12170,70 @@ ; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $44, %k7, %k0 +; AVX512BW-NEXT: kmovq %k3, %k7 +; AVX512BW-NEXT: kshiftrq $44, %k3, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $45, %k7, %k0 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 
-; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z} +; AVX512BW-NEXT: kmovq %k4, %k5 +; AVX512BW-NEXT: kandw %k4, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $45, %k7, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload @@ -12242,18 +12241,18 @@ ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12261,78 +12260,76 @@ ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; 
AVX512BW-NEXT: kandw %k5, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $47, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k4 -; AVX512BW-NEXT: kshiftlw $15, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $47, %k7, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k6, %k6 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k6, %k1 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z} -; AVX512BW-NEXT: kshiftrq $48, %k4, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload +; AVX512BW-NEXT: kshiftrq $48, %k5, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 
2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $49, %k4, %k0 -; AVX512BW-NEXT: kmovq %k4, %k7 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $49, %k5, %k0 +; AVX512BW-NEXT: kmovq %k5, %k7 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -12343,17 +12340,17 @@ ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k0 +; AVX512BW-NEXT: kandw %k2, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $50, %k7, %k1 -; AVX512BW-NEXT: kmovq %k7, %k3 +; AVX512BW-NEXT: kmovq %k7, %k2 ; AVX512BW-NEXT: kshiftlw $14, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -12361,40 +12358,40 @@ ; AVX512BW-NEXT: kshiftlw $15, %k1, %k6 ; AVX512BW-NEXT: korw %k6, %k0, %k7 ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z} -; AVX512BW-NEXT: kandw %k2, %k1, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512BW-NEXT: kandw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k0, %k1 -; AVX512BW-NEXT: kmovq %k3, %k7 -; AVX512BW-NEXT: kshiftrq $51, %k3, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $51, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12402,12 +12399,12 @@ ; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $52, %k7, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $52, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 @@ -12415,49 +12412,50 @@ ; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k0, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kmovq 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $53, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kandw %k3, %k0, %k6 @@ -12473,45 +12471,44 @@ ; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k6, %k6 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 ; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 ; AVX512BW-NEXT: korw %k1, %k6, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k1 +; AVX512BW-NEXT: kandw %k2, %k0, %k1 ; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload ; AVX512BW-NEXT: kshiftrq $55, %k7, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; 
AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload @@ -12521,8 +12518,8 @@ ; AVX512BW-NEXT: kshiftlw $15, %k0, %k6 ; AVX512BW-NEXT: kshiftrw $7, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload @@ -12537,11 +12534,11 @@ ; AVX512BW-NEXT: kandw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $3, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $2, %k6, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 @@ -12558,41 +12555,40 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k4, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-NEXT: kandw %k1, %k0, %k0 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $10, %k6, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 -; AVX512BW-NEXT: kandw %k4, %k0, %k1 -; AVX512BW-NEXT: kmovq %k2, %k7 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k0, %k1 ; AVX512BW-NEXT: kshiftrq $58, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, 
%k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512BW-NEXT: kandw %k4, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload @@ -12601,12 +12597,12 @@ ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $59, %k7, %k6 +; AVX512BW-NEXT: kshiftrq $59, %k2, %k6 ; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 ; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 ; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 @@ -12617,71 +12613,72 @@ ; AVX512BW-NEXT: kandw %k1, %k6, %k1 ; AVX512BW-NEXT: kshiftrw $14, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512BW-NEXT: kandw %k5, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload -; AVX512BW-NEXT: kshiftrq $60, %k7, %k0 +; AVX512BW-NEXT: kmovq %k2, %k5 +; AVX512BW-NEXT: kshiftrq $60, %k2, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $11, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $10, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; 
AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 -; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $7, %k0, %k6 +; AVX512BW-NEXT: korw %k6, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kandw %k3, %k0, %k6 -; AVX512BW-NEXT: kshiftrq $61, %k7, %k0 -; AVX512BW-NEXT: kmovq %k7, %k2 -; AVX512BW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512BW-NEXT: kshiftrw $4, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kandw %k4, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $3, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $2, %k1, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 +; AVX512BW-NEXT: kandw %k3, %k0, %k1 +; AVX512BW-NEXT: kshiftrq $61, %k5, %k6 +; AVX512BW-NEXT: kshiftlw $15, %k6, %k0 +; AVX512BW-NEXT: kshiftrw $4, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kandw %k4, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512BW-NEXT: kandw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k7 -; AVX512BW-NEXT: korw %k7, %k6, %k6 -; AVX512BW-NEXT: kshiftlw $1, %k6, %k6 -; AVX512BW-NEXT: kshiftrw $1, %k6, %k6 -; AVX512BW-NEXT: korw %k1, %k6, %k6 -; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z} -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512BW-NEXT: korw %k1, %k0, %k0 +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512BW-NEXT: kandw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k6, %k7 +; AVX512BW-NEXT: korw %k7, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512BW-NEXT: korw %k0, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z} +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-NEXT: kandw %k1, %k6, %k1 +; AVX512BW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512BW-NEXT: korw %k0, %k1, %k0 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $62, %k2, %k0 +; AVX512BW-NEXT: kshiftrq $62, %k5, %k0 ; AVX512BW-NEXT: kshiftlw $15, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $13, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 @@ -12705,32 +12702,32 @@ ; AVX512BW-NEXT: kandw %k6, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512BW-NEXT: kandw %k6, %k1, %k1 +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k0 -; AVX512BW-NEXT: kshiftrq $63, %k2, 
%k2 +; AVX512BW-NEXT: kshiftrq $63, %k5, %k5 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512BW-NEXT: kandw %k1, %k0, %k1 -; AVX512BW-NEXT: kshiftlw $15, %k2, %k0 +; AVX512BW-NEXT: kshiftlw $15, %k5, %k0 ; AVX512BW-NEXT: kshiftrw $6, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $5, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512BW-NEXT: kandw %k5, %k1, %k1 +; AVX512BW-NEXT: kandw %k3, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $4, %k0, %k6 ; AVX512BW-NEXT: korw %k6, %k1, %k1 ; AVX512BW-NEXT: kandw %k4, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $3, %k0, %k5 -; AVX512BW-NEXT: korw %k5, %k1, %k1 -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $2, %k0, %k4 +; AVX512BW-NEXT: kshiftrw $3, %k0, %k4 ; AVX512BW-NEXT: korw %k4, %k1, %k1 -; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512BW-NEXT: kandw %k3, %k1, %k1 -; AVX512BW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $2, %k0, %k3 +; AVX512BW-NEXT: korw %k3, %k1, %k1 +; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512BW-NEXT: kandw %k2, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $14, %k5, %k2 ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512BW-NEXT: kshiftrw $1, %k1, %k1 @@ -12800,7 +12797,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor8_vf2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k1 +; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 @@ -12854,7 +12851,7 @@ ; ; AVX512BW-LABEL: mask_replication_factor8_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0