diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -19859,6 +19859,44 @@ return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops)); } +// Attempt to merge nested concat_vectors/undefs. +// Fold concat_vectors(concat_vectors(x,y,z,w),u,u,concat_vectors(a,b,c,d)) +// --> concat_vectors(x,y,z,w,u,u,u,u,u,u,u,u,a,b,c,d) +static SDValue combineConcatVectorOfConcatVectors(SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + // Ensure we're concatenating UNDEF and CONCAT_VECTORS nodes of similar types. + EVT SubVT; + SDValue FirstConcat; + for (const SDValue &Op : N->ops()) { + if (Op.isUndef()) + continue; + if (Op.getOpcode() != ISD::CONCAT_VECTORS) + return SDValue(); + if (!FirstConcat) { + SubVT = Op.getOperand(0).getValueType(); + if (!DAG.getTargetLoweringInfo().isTypeLegal(SubVT)) + return SDValue(); + FirstConcat = Op; + continue; + } + if (SubVT != Op.getOperand(0).getValueType()) + return SDValue(); + } + assert(FirstConcat && "Concat of all-undefs found"); + + SmallVector<SDValue> ConcatOps; + for (const SDValue &Op : N->ops()) { + if (Op.isUndef()) { + ConcatOps.append(FirstConcat->getNumOperands(), DAG.getUNDEF(SubVT)); + continue; + } + ConcatOps.append(Op->op_begin(), Op->op_end()); + } + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, ConcatOps); +} + // Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR // operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at // most two distinct vectors the same size as the result, attempt to turn this @@ -20122,9 +20160,13 @@ return V; // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. - if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) + if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { + if (SDValue V = combineConcatVectorOfConcatVectors(N, DAG)) + return V; + if (SDValue V = combineConcatVectorOfExtracts(N, DAG)) return V; + } if (SDValue V = combineConcatVectorOfCasts(N, DAG)) return V; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10420,8 +10420,29 @@ isTypeLegal(Op.getValueType()) && "Expected legal scalable vector type!"); - if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2) - return Op; + if (isTypeLegal(Op.getOperand(0).getValueType())) { + unsigned NumOperands = Op->getNumOperands(); + assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && + "Unexpected number of operands in CONCAT_VECTORS"); + + if (Op.getNumOperands() == 2) + return Op; + + // Concat each pair of subvectors and pack into the lower half of the array. + SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); + while (ConcatOps.size() > 1) { + for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { + SDValue V1 = ConcatOps[I]; + SDValue V2 = ConcatOps[I + 1]; + EVT SubVT = V1.getValueType(); + EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext()); + ConcatOps[I / 2] = + DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2); + } + ConcatOps.resize(ConcatOps.size() / 2); + } + return ConcatOps[0]; + } return SDValue(); } @@ -13584,7 +13605,7 @@ // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat.
The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. - if (N0 == N1 && VT.getVectorNumElements() == 2) { + if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getScalarSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, dl, MVT::i64)); @@ -13599,7 +13620,7 @@ // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - if (N1Opc != ISD::BITCAST) + if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -8824,54 +8824,68 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) { - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); SDLoc dl(Op); - EVT VT = Op.getValueType(); - EVT Op1VT = V1.getValueType(); - EVT Op2VT = V2.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - assert(Op1VT == Op2VT && "Operand types don't match!"); - assert(VT.getScalarSizeInBits() == 1 && + assert(Op.getValueType().getScalarSizeInBits() == 1 && + "Unexpected custom CONCAT_VECTORS lowering"); + assert(isPowerOf2_32(Op.getNumOperands()) && "Unexpected custom CONCAT_VECTORS lowering"); assert(ST->hasMVEIntegerOps() && "CONCAT_VECTORS lowering only supported for MVE"); - SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); - SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); - - // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets - // promoted to v8i16, etc. - - MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); - - // Extract the vector elements from Op1 and Op2 one by one and truncate them - // to be the right size for the destination. For example, if Op1 is v4i1 then - // the promoted vector is v4i32. The result of concatentation gives a v8i1, - // which when promoted is v8i16. That means each i32 element from Op1 needs - // truncating to i16 and inserting in the result. - EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); - SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); - auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { - EVT NewVT = NewV.getValueType(); - EVT ConcatVT = ConVec.getValueType(); - for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, - DAG.getIntPtrConstant(i, dl)); - ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, - DAG.getConstant(j, dl, MVT::i32)); - } - return ConVec; + auto ConcatPair = [&](SDValue V1, SDValue V2) { + EVT Op1VT = V1.getValueType(); + EVT Op2VT = V2.getValueType(); + assert(Op1VT == Op2VT && "Operand types don't match!"); + EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext()); + + SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); + SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); + + // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets + // promoted to v8i16, etc. 
+ MVT ElType = + getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + unsigned NumElts = 2 * Op1VT.getVectorNumElements(); + + // Extract the vector elements from Op1 and Op2 one by one and truncate them + // to be the right size for the destination. For example, if Op1 is v4i1 + // then the promoted vector is v4i32. The result of concatenation gives a + // v8i1, which when promoted is v8i16. That means each i32 element from Op1 + // needs truncating to i16 and inserting in the result. + EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); + SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); + auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { + EVT NewVT = NewV.getValueType(); + EVT ConcatVT = ConVec.getValueType(); + for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, + DAG.getIntPtrConstant(i, dl)); + ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + } + return ConVec; + }; + unsigned j = 0; + ConVec = ExtractInto(NewV1, ConVec, j); + ConVec = ExtractInto(NewV2, ConVec, j); + + // Now return the result of comparing the subvector with zero, + // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); }; - unsigned j = 0; - ConVec = ExractInto(NewV1, ConVec, j); - ConVec = ExractInto(NewV2, ConVec, j); - // Now return the result of comparing the subvector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. - return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, - DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + // Concat each pair of subvectors and pack into the lower half of the array.
+ SmallVector ConcatOps(Op->op_begin(), Op->op_end()); + while (ConcatOps.size() > 1) { + for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { + SDValue V1 = ConcatOps[I]; + SDValue V2 = ConcatOps[I + 1]; + ConcatOps[I / 2] = ConcatPair(V1, V2); + } + ConcatOps.resize(ConcatOps.size() / 2); + } + return ConcatOps[0]; } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll @@ -1290,37 +1290,30 @@ ; LMULMAX1-NEXT: vnsrl.wi v25, v8, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v25, 4 -; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v27, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 12, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 8 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 4 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v26, 8 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 12 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; LMULMAX1-NEXT: vse8.v v25, (a0) +; LMULMAX1-NEXT: vse8.v v26, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v16i32_v16i8: @@ -1624,43 +1617,36 @@ ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu 
; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v25, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v27, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v26, 4 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vse8.v v25, (a0) +; LMULMAX1-NEXT: vse8.v v26, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v8i64_v8i8: @@ -1685,37 +1671,30 @@ ; LMULMAX1-NEXT: vnsrl.wi v25, v8, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v25, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v27, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, 
v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v26, 4 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; LMULMAX1-NEXT: vse16.v v25, (a0) +; LMULMAX1-NEXT: vse16.v v26, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v8i64_v8i16: @@ -1779,88 +1758,68 @@ ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v27, v25 -; LMULMAX1-NEXT: vslideup.vi v27, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v26, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v28, v26 -; LMULMAX1-NEXT: vslideup.vi v28, v27, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v11, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v27, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v29, 4 -; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-NEXT: vmv.v.i v27, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 6 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v12, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v12, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 10, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 8 ; 
LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v13, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v28, 2 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v29, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 12, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 10 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v14, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v14, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 14, e8, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v25, v26, 12 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v15, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v15, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 +; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v26, 8 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 14 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; LMULMAX1-NEXT: vse8.v v27, (a0) +; LMULMAX1-NEXT: vse8.v v25, (a0) ; LMULMAX1-NEXT: ret ; ; LMULMAX4-LABEL: truncstore_v16i64_v16i8: @@ -1897,67 +1856,54 @@ ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v8, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v28, v25 -; LMULMAX1-NEXT: vslideup.vi v28, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v9, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v26, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; LMULMAX1-NEXT: vmv.v.i v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; LMULMAX1-NEXT: vsetivli zero, 2, e16, m1, tu, mu ; LMULMAX1-NEXT: vmv1r.v v27, v26 -; LMULMAX1-NEXT: vslideup.vi v27, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v27, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v10, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v9, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, 
v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v27, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v11, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v10, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v28, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v27, v25, 4 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vnsrl.wi v25, v11, 0 +; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v29, 4 +; LMULMAX1-NEXT: vslideup.vi v27, v25, 6 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v12, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v12, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v25 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v13, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v13, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v28, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v29, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v14, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v14, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e16, m1, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v15, 0 +; LMULMAX1-NEXT: vnsrl.wi v25, v15, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v28, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v25, v28, 2 +; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v25, 4 +; LMULMAX1-NEXT: vslideup.vi v26, v25, 6 ; LMULMAX1-NEXT: addi a1, a0, 16 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; LMULMAX1-NEXT: vse16.v v26, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll @@ -202,15 +202,15 @@ ; ; LMULMAX1-LABEL: fpround_v8f64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -48 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vle64.v v25, (a0) -; LMULMAX1-NEXT: addi a2, a0, 32 +; 
LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle64.v v26, (a2) ; LMULMAX1-NEXT: addi a2, a0, 48 ; LMULMAX1-NEXT: vle64.v v27, (a2) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a0, a0, 32 ; LMULMAX1-NEXT: vle64.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v29, v27 @@ -222,35 +222,25 @@ ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 20 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vle16.v v26, (a0) -; LMULMAX1-NEXT: addi a0, sp, 40 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rod.f.f.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 ; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vse16.v v25, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 16 -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: addi a0, sp, 32 -; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 48 +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret %a = load <8 x double>, <8 x double>* %x %d = fptrunc <8 x double> %a to <8 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll @@ -456,43 +456,36 @@ ; LMULMAX1-NEXT: vnsrl.wi v27, v29, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v30, v29 -; LMULMAX1-NEXT: vslideup.vi v30, v27, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, v27, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v30, 0 +; LMULMAX1-NEXT: vslideup.vi v29, v27, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v28, v26 +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v27, 0 ; 
LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v26, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v29, 4 +; LMULMAX1-NEXT: vslideup.vi v29, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vse8.v v27, (a1) +; LMULMAX1-NEXT: vse8.v v29, (a1) ; LMULMAX1-NEXT: ret %a = load <8 x double>, <8 x double>* %x %d = fptosi <8 x double> %a to <8 x i8> @@ -530,43 +523,36 @@ ; LMULMAX1-NEXT: vnsrl.wi v27, v29, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v30, v29 -; LMULMAX1-NEXT: vslideup.vi v30, v27, 0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v27, v27, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, v27, 2 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v27, 0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v30, 0 +; LMULMAX1-NEXT: vslideup.vi v29, v27, 2 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v28, v26 +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; LMULMAX1-NEXT: vnsrl.wi v26, v28, 0 +; LMULMAX1-NEXT: vnsrl.wi v26, v27, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v26, v26, 0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v26, 0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v29, v26, 4 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v26, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vnsrl.wi v25, v25, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v29, v25, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v27, v29, 4 +; LMULMAX1-NEXT: vslideup.vi v29, v25, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vse8.v v27, (a1) +; LMULMAX1-NEXT: vse8.v v29, (a1) ; LMULMAX1-NEXT: ret %a = load <8 x double>, <8 x double>* %x %d = fptoui <8 x double> %a to <8 x i8> @@ -585,7 +571,7 @@ ; ; LMULMAX1-LABEL: fp2si_v8f64_v8i1: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: 
vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vmclr.m v0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 @@ -596,58 +582,43 @@ ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vmv.v.i v27, 0 ; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v26 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 0 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v29, v9 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v29, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v28, v9 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 ; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vmclr.m v0 -; LMULMAX1-NEXT: vmerge.vim v30, v29, 1, v0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v28, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v30, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v29, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v29, v10 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v29, v11 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v28, v10 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v27, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 4 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v25, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.x.f.w v26, v11 +; LMULMAX1-NEXT: vand.vi v26, v26, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v26, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi 
v28, v25, 4 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v25, 0 ; LMULMAX1-NEXT: ret %z = fptosi <8 x double> %x to <8 x i1> ret <8 x i1> %z @@ -664,7 +635,7 @@ ; ; LMULMAX1-LABEL: fp2ui_v8f64_v8i1: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: vmv.v.i v25, 0 ; LMULMAX1-NEXT: vmclr.m v0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 @@ -675,58 +646,43 @@ ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu ; LMULMAX1-NEXT: vmv.v.i v27, 0 ; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vmv1r.v v29, v26 -; LMULMAX1-NEXT: vslideup.vi v29, v28, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 0 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v29, v9 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v29, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v28, v9 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 ; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v25, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmv.v.i v29, 0 -; LMULMAX1-NEXT: vmclr.m v0 -; LMULMAX1-NEXT: vmerge.vim v30, v29, 1, v0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v30, v28, 0 +; LMULMAX1-NEXT: vslideup.vi v26, v28, 2 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v30, 0 -; LMULMAX1-NEXT: vmerge.vim v28, v29, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v29, v10 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v29, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v29, 0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, mu ; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v26, v25, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v29, v11 -; LMULMAX1-NEXT: vand.vi v29, v29, 1 -; LMULMAX1-NEXT: vmsne.vi v0, v29, 0 +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v28, v10 +; LMULMAX1-NEXT: vand.vi v28, v28, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu -; LMULMAX1-NEXT: vmerge.vim v27, v27, 1, v0 -; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, tu, mu -; LMULMAX1-NEXT: vslideup.vi v26, v27, 2 -; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; LMULMAX1-NEXT: vmerge.vim v28, v27, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 6, e8, mf2, tu, mu +; LMULMAX1-NEXT: vslideup.vi v26, v28, 4 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; LMULMAX1-NEXT: 
vmsne.vi v0, v26, 0 ; LMULMAX1-NEXT: vmerge.vim v25, v25, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vfncvt.rtz.xu.f.w v26, v11 +; LMULMAX1-NEXT: vand.vi v26, v26, 1 +; LMULMAX1-NEXT: vmsne.vi v0, v26, 0 +; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; LMULMAX1-NEXT: vmerge.vim v26, v27, 1, v0 ; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, tu, mu -; LMULMAX1-NEXT: vslideup.vi v28, v25, 4 +; LMULMAX1-NEXT: vslideup.vi v25, v26, 6 ; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf2, ta, mu -; LMULMAX1-NEXT: vmsne.vi v0, v28, 0 +; LMULMAX1-NEXT: vmsne.vi v0, v25, 0 ; LMULMAX1-NEXT: ret %z = fptoui <8 x double> %x to <8 x i1> ret <8 x i1> %z diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -466,15 +466,15 @@ ; ; LMULMAX1-LABEL: si2fp_v8i64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -48 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vle64.v v25, (a0) -; LMULMAX1-NEXT: addi a2, a0, 32 +; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle64.v v26, (a2) ; LMULMAX1-NEXT: addi a2, a0, 48 ; LMULMAX1-NEXT: vle64.v v27, (a2) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a0, a0, 32 ; LMULMAX1-NEXT: vle64.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.x.w v29, v27 @@ -486,35 +486,25 @@ ; LMULMAX1-NEXT: vfncvt.f.x.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 20 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.x.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vle16.v v26, (a0) -; LMULMAX1-NEXT: addi a0, sp, 40 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.x.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 ; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vse16.v v25, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 16 -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: addi a0, sp, 32 -; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 48 +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret %a = load <8 x i64>, <8 x i64>* %x %d = sitofp <8 x i64> %a to <8 x half> @@ -536,15 +526,15 @@ ; ; LMULMAX1-LABEL: ui2fp_v8i64_v8f16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -48 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 48 +; LMULMAX1-NEXT: addi sp, sp, -32 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; LMULMAX1-NEXT: vle64.v v25, (a0) -; LMULMAX1-NEXT: addi a2, a0, 32 +; 
LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: vle64.v v26, (a2) ; LMULMAX1-NEXT: addi a2, a0, 48 ; LMULMAX1-NEXT: vle64.v v27, (a2) -; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: addi a0, a0, 32 ; LMULMAX1-NEXT: vle64.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v29, v27 @@ -556,35 +546,25 @@ ; LMULMAX1-NEXT: vfncvt.f.xu.w v27, v28 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v28, v27 -; LMULMAX1-NEXT: addi a0, sp, 20 +; LMULMAX1-NEXT: addi a0, sp, 24 ; LMULMAX1-NEXT: vse16.v v28, (a0) ; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v27, v26 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v26, v27 -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 24 -; LMULMAX1-NEXT: vle16.v v26, (a0) -; LMULMAX1-NEXT: addi a0, sp, 40 +; LMULMAX1-NEXT: addi a0, sp, 20 ; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; LMULMAX1-NEXT: vfncvt.f.xu.w v26, v25 ; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; LMULMAX1-NEXT: vfncvt.f.f.w v25, v26 ; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vse16.v v25, (a0) -; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 16 -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: addi a0, sp, 32 -; LMULMAX1-NEXT: vse16.v v25, (a0) ; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: addi a0, sp, 16 ; LMULMAX1-NEXT: vle16.v v25, (a0) ; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 48 +; LMULMAX1-NEXT: addi sp, sp, 32 ; LMULMAX1-NEXT: ret %a = load <8 x i64>, <8 x i64>* %x %d = uitofp <8 x i64> %a to <8 x half> diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2267,33 +2267,31 @@ ; RV64-LABEL: mgather_baseidx_nxv32i8: ; RV64: # %bb.0: ; RV64-NEXT: vmv1r.v v25, v0 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu +; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: srli a2, a1, 2 -; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, mu -; RV64-NEXT: vslidedown.vx v26, v0, a2 -; RV64-NEXT: srli a1, a1, 3 -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v26, a1 -; RV64-NEXT: vsetvli a2, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v11 +; RV64-NEXT: srli a2, a1, 3 +; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vx v0, v0, a2 +; RV64-NEXT: vsetvli a3, zero, e64, m8, ta, mu +; RV64-NEXT: vsext.vf8 v16, v9 ; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vluxei64.v v13, (a0), v16, v0.t +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, mu +; RV64-NEXT: vslidedown.vx v0, v25, a1 +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu ; RV64-NEXT: vsext.vf8 v16, v10 ; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vmv1r.v v0, v26 ; RV64-NEXT: vluxei64.v v14, (a0), v16, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v8 -; 
RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vmv1r.v v0, v25 -; RV64-NEXT: vluxei64.v v12, (a0), v16, v0.t -; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, mu -; RV64-NEXT: vslidedown.vx v0, v25, a1 +; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vx v0, v0, a2 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: vsext.vf8 v16, v9 +; RV64-NEXT: vsext.vf8 v16, v11 ; RV64-NEXT: vsetvli zero, zero, e8, m1, tu, mu -; RV64-NEXT: vluxei64.v v13, (a0), v16, v0.t +; RV64-NEXT: vluxei64.v v15, (a0), v16, v0.t ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %ptrs = getelementptr inbounds i8, i8* %base, %idxs diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -8,17 +8,13 @@ define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_12u4: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 16(%rdi), %ymm0 -; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vmovups 16(%rdi), %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_12u4: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 16(%eax), %ymm0 -; X86-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovups 16(%eax), %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 @@ -35,19 +31,15 @@ define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_2f64_23z5: ; ALL: # %bb.0: -; ALL-NEXT: vmovups 32(%rdi), %ymm0 -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vmovdqu64 32(%rdi), %zmm0 +; ALL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_2f64_23z5: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovups 32(%eax), %ymm0 -; X86-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; X86-AVX512F-NEXT: vmovdqu64 32(%eax), %zmm0 +; X86-AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -575,9 +575,7 @@ define <16 x float> @insert_sub2_4(<16 x float> %base, <4 x float> %sub1, <4 x float> %sub2, <4 x float> %sub3, <4 x float> %sub4) { ; ALL-LABEL: insert_sub2_4: ; ALL: # %bb.0: -; ALL-NEXT: vinsertf32x4 $2, %xmm3, %zmm0, %zmm1 -; ALL-NEXT: vmovapd {{.*#+}} zmm2 = [0,1,12,13,4,5,6,7] -; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 +; ALL-NEXT: vinsertf32x4 $1, %xmm3, %zmm0, %zmm0 ; ALL-NEXT: retq %sub12 = shufflevector <4 x float> %sub1, <4 x float> %sub2, <8 x i32> %sub34 = shufflevector <4 x float> %sub3, 
<4 x float> %sub4, <8 x i32> diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -369,8 +369,8 @@ ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi) ; AVX512-NEXT: vzeroupper @@ -1467,15 +1467,15 @@ ; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3 ; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4 ; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5 +; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 -; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 -; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 -; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1