diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1130,14 +1130,16 @@ return convertFromScalableVector(VT, Splat, DAG, Subtarget); } + unsigned NumElts = Op.getNumOperands(); + // Try and match an index sequence, which we can lower directly to the vid // instruction. An all-undef vector is matched by getSplatValue, above. if (VT.isInteger()) { bool IsVID = true; - for (unsigned i = 0, e = Op.getNumOperands(); i < e && IsVID; i++) - IsVID &= Op.getOperand(i).isUndef() || - (isa<ConstantSDNode>(Op.getOperand(i)) && - Op.getConstantOperandVal(i) == i); + for (unsigned I = 0; I < NumElts && IsVID; I++) + IsVID &= Op.getOperand(I).isUndef() || + (isa<ConstantSDNode>(Op.getOperand(I)) && + Op.getConstantOperandVal(I) == I); if (IsVID) { SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, ContainerVT, Mask, VL); @@ -1145,6 +1147,55 @@ } } + // Try and optimize BUILD_VECTORs with "dominant values" - these are values + // which constitute a large proportion of the elements. In such cases we can + // splat a vector with the dominant element and make up the shortfall with + // INSERT_VECTOR_ELTs. + // Note that this includes vectors of 2 elements by association. The + // upper-most element is the "dominant" one, allowing us to use a splat to + // "insert" the upper element, and an insert of the lower element at position + // 0, which improves codegen. + SDValue DominantValue; + DenseMap<SDValue, unsigned> ValueCounts; + // Use a fairly conservative threshold. A future optimization could be to use + // multiple vmerge.vi/vmerge.vx instructions on "partially-dominant" + // elements with more relaxed thresholds. + unsigned NumUndefElts = + count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); }); + unsigned NumDefElts = NumElts - NumUndefElts; + unsigned DominantValueCountThreshold = NumDefElts <= 2 ? 0 : NumDefElts - 2; + + for (SDValue V : Op->op_values()) { + if (V.isUndef()) + continue; + + ValueCounts.insert(std::make_pair(V, 0)); + unsigned &Count = ValueCounts[V]; + + // Is this value dominant? + if (++Count > DominantValueCountThreshold) + DominantValue = V; + } + + // Don't perform this optimization when optimizing for size, since + // materializing elements and inserting them tends to cause code bloat. + if (DominantValue && !DAG.shouldOptForSize()) { + unsigned Opc = + VT.isFloatingPoint() ? 
RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL; + SDValue Vec = DAG.getNode(Opc, DL, ContainerVT, DominantValue, VL); + + if (ValueCounts.size() != 1) { + MVT XLenVT = Subtarget.getXLenVT(); + for (unsigned I = 0; I < NumElts; ++I) { + if (!Op.getOperand(I).isUndef() && Op.getOperand(I) != DominantValue) + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Vec, + Op.getOperand(I), DAG.getConstant(I, DL, XLenVT)); + } + } + + return convertFromScalableVector(VT, Vec, DAG, Subtarget); + } + return SDValue(); } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -612,8 +612,6 @@ ; ; LMULMAX2-RV64-LABEL: bswap_v2i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle64.v v25, (a0) ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 @@ -645,37 +643,37 @@ ; LMULMAX2-RV64-NEXT: slli a1, a1, 56 ; LMULMAX2-RV64-NEXT: or a1, a1, a4 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: or a1, a1, t1 -; LMULMAX2-RV64-NEXT: sd a1, 0(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: or t1, a1, t1 +; LMULMAX2-RV64-NEXT: vsetivli a2, 1, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX2-RV64-NEXT: srli a2, a1, 40 -; LMULMAX2-RV64-NEXT: and a2, a2, a7 -; LMULMAX2-RV64-NEXT: srli a4, a1, 56 -; LMULMAX2-RV64-NEXT: or a2, a2, a4 -; LMULMAX2-RV64-NEXT: srli a4, a1, 24 +; LMULMAX2-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX2-RV64-NEXT: srli a4, a2, 40 +; LMULMAX2-RV64-NEXT: and a4, a4, a7 +; LMULMAX2-RV64-NEXT: srli a1, a2, 56 +; LMULMAX2-RV64-NEXT: or a1, a4, a1 +; LMULMAX2-RV64-NEXT: srli a4, a2, 24 ; LMULMAX2-RV64-NEXT: and a4, a4, a6 -; LMULMAX2-RV64-NEXT: srli a5, a1, 8 +; LMULMAX2-RV64-NEXT: srli a5, a2, 8 ; LMULMAX2-RV64-NEXT: and a5, a5, t0 ; LMULMAX2-RV64-NEXT: or a4, a5, a4 -; LMULMAX2-RV64-NEXT: or a2, a4, a2 -; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: or a1, a4, a1 +; LMULMAX2-RV64-NEXT: slli a4, a2, 8 ; LMULMAX2-RV64-NEXT: and a4, a4, t2 -; LMULMAX2-RV64-NEXT: slli a5, a1, 24 +; LMULMAX2-RV64-NEXT: slli a5, a2, 24 ; LMULMAX2-RV64-NEXT: and a5, a5, t3 ; LMULMAX2-RV64-NEXT: or a4, a5, a4 -; LMULMAX2-RV64-NEXT: slli a5, a1, 40 +; LMULMAX2-RV64-NEXT: slli a5, a2, 40 ; LMULMAX2-RV64-NEXT: and a3, a5, a3 -; LMULMAX2-RV64-NEXT: slli a1, a1, 56 -; LMULMAX2-RV64-NEXT: or a1, a1, a3 -; LMULMAX2-RV64-NEXT: or a1, a1, a4 -; LMULMAX2-RV64-NEXT: or a1, a1, a2 -; LMULMAX2-RV64-NEXT: sd a1, 8(sp) +; LMULMAX2-RV64-NEXT: slli a2, a2, 56 +; LMULMAX2-RV64-NEXT: or a2, a2, a3 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vmv.v.x v25, a1 +; LMULMAX2-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vmv.s.x v25, t1 ; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle64.v v25, (sp) ; LMULMAX2-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: bswap_v2i64: @@ -745,8 +743,6 @@ ; ; LMULMAX1-RV64-LABEL: bswap_v2i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle64.v 
v25, (a0) ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 @@ -778,37 +774,37 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a1, a4 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: or a1, a1, t1 -; LMULMAX1-RV64-NEXT: sd a1, 0(sp) -; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: or t1, a1, t1 +; LMULMAX1-RV64-NEXT: vsetivli a2, 1, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV64-NEXT: srli a2, a1, 40 -; LMULMAX1-RV64-NEXT: and a2, a2, a7 -; LMULMAX1-RV64-NEXT: srli a4, a1, 56 -; LMULMAX1-RV64-NEXT: or a2, a2, a4 -; LMULMAX1-RV64-NEXT: srli a4, a1, 24 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: srli a4, a2, 40 +; LMULMAX1-RV64-NEXT: and a4, a4, a7 +; LMULMAX1-RV64-NEXT: srli a1, a2, 56 +; LMULMAX1-RV64-NEXT: or a1, a4, a1 +; LMULMAX1-RV64-NEXT: srli a4, a2, 24 ; LMULMAX1-RV64-NEXT: and a4, a4, a6 -; LMULMAX1-RV64-NEXT: srli a5, a1, 8 +; LMULMAX1-RV64-NEXT: srli a5, a2, 8 ; LMULMAX1-RV64-NEXT: and a5, a5, t0 ; LMULMAX1-RV64-NEXT: or a4, a5, a4 -; LMULMAX1-RV64-NEXT: or a2, a4, a2 -; LMULMAX1-RV64-NEXT: slli a4, a1, 8 +; LMULMAX1-RV64-NEXT: or a1, a4, a1 +; LMULMAX1-RV64-NEXT: slli a4, a2, 8 ; LMULMAX1-RV64-NEXT: and a4, a4, t2 -; LMULMAX1-RV64-NEXT: slli a5, a1, 24 +; LMULMAX1-RV64-NEXT: slli a5, a2, 24 ; LMULMAX1-RV64-NEXT: and a5, a5, t3 ; LMULMAX1-RV64-NEXT: or a4, a5, a4 -; LMULMAX1-RV64-NEXT: slli a5, a1, 40 +; LMULMAX1-RV64-NEXT: slli a5, a2, 40 ; LMULMAX1-RV64-NEXT: and a3, a5, a3 -; LMULMAX1-RV64-NEXT: slli a1, a1, 56 -; LMULMAX1-RV64-NEXT: or a1, a1, a3 -; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sd a1, 8(sp) +; LMULMAX1-RV64-NEXT: slli a2, a2, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v25, a1 +; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v25, t1 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 ; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y @@ -2199,8 +2195,6 @@ ; ; LMULMAX1-RV64-LABEL: bswap_v4i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle64.v v26, (a6) @@ -2234,33 +2228,35 @@ ; LMULMAX1-RV64-NEXT: slli a2, a2, 56 ; LMULMAX1-RV64-NEXT: or a2, a2, a3 ; LMULMAX1-RV64-NEXT: or a1, a2, a1 -; LMULMAX1-RV64-NEXT: or a1, a1, a5 -; LMULMAX1-RV64-NEXT: sd a1, 16(sp) -; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: or t4, a1, a5 +; LMULMAX1-RV64-NEXT: vsetivli a2, 1, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX1-RV64-NEXT: srli a2, a1, 40 -; LMULMAX1-RV64-NEXT: and a2, a2, t0 -; LMULMAX1-RV64-NEXT: srli a3, a1, 56 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a1, 24 -; LMULMAX1-RV64-NEXT: and a3, a3, a7 -; LMULMAX1-RV64-NEXT: srli a5, a1, 8 -; LMULMAX1-RV64-NEXT: and a5, a5, t1 -; LMULMAX1-RV64-NEXT: or a3, a5, a3 -; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: slli a3, a1, 8 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: srli 
a3, a2, 40 +; LMULMAX1-RV64-NEXT: and a3, a3, t0 +; LMULMAX1-RV64-NEXT: srli a5, a2, 56 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a2, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: srli a1, a2, 8 +; LMULMAX1-RV64-NEXT: and a1, a1, t1 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: and a3, a3, t2 -; LMULMAX1-RV64-NEXT: slli a5, a1, 24 +; LMULMAX1-RV64-NEXT: slli a5, a2, 24 ; LMULMAX1-RV64-NEXT: and a5, a5, t3 ; LMULMAX1-RV64-NEXT: or a3, a5, a3 -; LMULMAX1-RV64-NEXT: slli a5, a1, 40 +; LMULMAX1-RV64-NEXT: slli a5, a2, 40 ; LMULMAX1-RV64-NEXT: and a5, a5, a4 -; LMULMAX1-RV64-NEXT: slli a1, a1, 56 -; LMULMAX1-RV64-NEXT: or a1, a1, a5 -; LMULMAX1-RV64-NEXT: or a1, a1, a3 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sd a1, 24(sp) +; LMULMAX1-RV64-NEXT: slli a2, a2, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a5 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 +; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, t4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srli a2, a1, 40 ; LMULMAX1-RV64-NEXT: and a2, a2, t0 @@ -2282,39 +2278,38 @@ ; LMULMAX1-RV64-NEXT: slli a1, a1, 56 ; LMULMAX1-RV64-NEXT: or a1, a1, a5 ; LMULMAX1-RV64-NEXT: or a1, a1, a3 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sd a1, 0(sp) +; LMULMAX1-RV64-NEXT: or t4, a1, a2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 1, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 -; LMULMAX1-RV64-NEXT: srli a2, a1, 40 -; LMULMAX1-RV64-NEXT: and a2, a2, t0 -; LMULMAX1-RV64-NEXT: srli a3, a1, 56 -; LMULMAX1-RV64-NEXT: or a2, a2, a3 -; LMULMAX1-RV64-NEXT: srli a3, a1, 24 -; LMULMAX1-RV64-NEXT: and a3, a3, a7 -; LMULMAX1-RV64-NEXT: srli a5, a1, 8 -; LMULMAX1-RV64-NEXT: and a5, a5, t1 -; LMULMAX1-RV64-NEXT: or a3, a5, a3 -; LMULMAX1-RV64-NEXT: or a2, a3, a2 -; LMULMAX1-RV64-NEXT: slli a3, a1, 8 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: srli a3, a2, 40 +; LMULMAX1-RV64-NEXT: and a3, a3, t0 +; LMULMAX1-RV64-NEXT: srli a5, a2, 56 +; LMULMAX1-RV64-NEXT: or a3, a3, a5 +; LMULMAX1-RV64-NEXT: srli a5, a2, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, a7 +; LMULMAX1-RV64-NEXT: srli a1, a2, 8 +; LMULMAX1-RV64-NEXT: and a1, a1, t1 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 ; LMULMAX1-RV64-NEXT: and a3, a3, t2 -; LMULMAX1-RV64-NEXT: slli a5, a1, 24 +; LMULMAX1-RV64-NEXT: slli a5, a2, 24 ; LMULMAX1-RV64-NEXT: and a5, a5, t3 ; LMULMAX1-RV64-NEXT: or a3, a5, a3 -; LMULMAX1-RV64-NEXT: slli a5, a1, 40 +; LMULMAX1-RV64-NEXT: slli a5, a2, 40 ; LMULMAX1-RV64-NEXT: and a4, a5, a4 -; LMULMAX1-RV64-NEXT: slli a1, a1, 56 -; LMULMAX1-RV64-NEXT: or a1, a1, a4 -; LMULMAX1-RV64-NEXT: or a1, a1, a3 -; LMULMAX1-RV64-NEXT: or a1, a1, a2 -; LMULMAX1-RV64-NEXT: sd a1, 8(sp) +; LMULMAX1-RV64-NEXT: slli a2, a2, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v25, a1 +; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v25, t4 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) -; LMULMAX1-RV64-NEXT: addi a1, sp, 16 -; 
LMULMAX1-RV64-NEXT: vle64.v v26, (a1) ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -3790,11 +3790,11 @@ ; ; LMULMAX2-RV64-LABEL: ctlz_v2i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX2-RV64-NEXT: srli a2, a1, 1 ; LMULMAX2-RV64-NEXT: or a1, a1, a2 ; LMULMAX2-RV64-NEXT: srli a2, a1, 2 @@ -3850,9 +3850,8 @@ ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 0(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu -; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX2-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vmv.v.x v26, a1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: srli a3, a1, 1 ; LMULMAX2-RV64-NEXT: or a1, a1, a3 @@ -3879,11 +3878,10 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 8(sp) +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vmv.s.x v26, a1 ; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle64.v v25, (sp) -; LMULMAX2-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v2i64: @@ -4027,11 +4025,11 @@ ; ; LMULMAX1-RV64-LABEL: ctlz_v2i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a2, a1, 2 @@ -4087,9 +4085,8 @@ ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 0(sp) -; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srli a3, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a3 @@ -4116,11 +4113,10 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 8(sp) +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) 
; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y @@ -11796,12 +11792,12 @@ ; ; LMULMAX1-RV64-LABEL: ctlz_v4i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v26, (a6) +; LMULMAX1-RV64-NEXT: vle64.v v27, (a6) ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vsetivli a2, 1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v27, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: srli a3, a2, 1 ; LMULMAX1-RV64-NEXT: or a2, a2, a3 @@ -11858,10 +11854,9 @@ ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 16(sp) -; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a2, a1, 2 @@ -11887,8 +11882,11 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 24(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 +; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 ; LMULMAX1-RV64-NEXT: srli a2, a1, 2 @@ -11914,8 +11912,8 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 0(sp) -; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v27, a1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srli a2, a1, 1 ; LMULMAX1-RV64-NEXT: or a1, a1, a2 @@ -11942,14 +11940,11 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 8(sp) +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a1 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) -; LMULMAX1-RV64-NEXT: addi a1, sp, 16 -; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v27, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -2638,8 +2638,6 @@ ; ; LMULMAX2-RV64-LABEL: cttz_v2i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -16 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vle64.v v25, (a0) ; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu @@ -2690,7 +2688,8 @@ ; LMULMAX2-RV64-NEXT: addi a5, a5, 257 ; LMULMAX2-RV64-NEXT: mul 
a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 8(sp) +; LMULMAX2-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vmv.v.x v26, a1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX2-RV64-NEXT: addi a3, a1, -1 ; LMULMAX2-RV64-NEXT: not a1, a1 @@ -2707,11 +2706,10 @@ ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: sd a1, 0(sp) +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vmv.s.x v26, a1 ; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX2-RV64-NEXT: vle64.v v25, (sp) -; LMULMAX2-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v2i64: @@ -2823,8 +2821,6 @@ ; ; LMULMAX1-RV64-LABEL: cttz_v2i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -16 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu @@ -2875,7 +2871,8 @@ ; LMULMAX1-RV64-NEXT: addi a5, a5, 257 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 8(sp) +; LMULMAX1-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: addi a3, a1, -1 ; LMULMAX1-RV64-NEXT: not a1, a1 @@ -2892,11 +2889,10 @@ ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 0(sp) +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y @@ -8164,8 +8160,6 @@ ; ; LMULMAX1-RV64-LABEL: cttz_v4i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi sp, sp, -32 -; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: addi a6, a0, 16 ; LMULMAX1-RV64-NEXT: vle64.v v26, (a6) @@ -8218,11 +8212,12 @@ ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: mul a4, a4, a1 ; LMULMAX1-RV64-NEXT: srli a4, a4, 56 -; LMULMAX1-RV64-NEXT: sd a4, 24(sp) -; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 -; LMULMAX1-RV64-NEXT: addi a2, a4, -1 -; LMULMAX1-RV64-NEXT: not a4, a4 -; LMULMAX1-RV64-NEXT: and a2, a4, a2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v27, a4 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: addi a4, a2, -1 +; LMULMAX1-RV64-NEXT: not a2, a2 +; LMULMAX1-RV64-NEXT: and a2, a2, a4 ; LMULMAX1-RV64-NEXT: srli a4, a2, 1 ; LMULMAX1-RV64-NEXT: and a4, a4, a7 ; LMULMAX1-RV64-NEXT: sub a2, a2, a4 @@ -8235,7 +8230,9 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sd a2, 16(sp) +; LMULMAX1-RV64-NEXT: vsetvli a4, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 1, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 ; LMULMAX1-RV64-NEXT: addi a4, a2, -1 @@ -8253,7 +8250,8 @@ ; LMULMAX1-RV64-NEXT: and a2, 
a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: sd a2, 8(sp) +; LMULMAX1-RV64-NEXT: vsetivli a4, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.v.x v26, a2 ; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 ; LMULMAX1-RV64-NEXT: addi a4, a2, -1 ; LMULMAX1-RV64-NEXT: not a2, a2 @@ -8270,14 +8268,11 @@ ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: sd a1, 0(sp) +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) -; LMULMAX1-RV64-NEXT: addi a1, sp, 16 -; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) -; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v27, (a6) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -19,3 +19,65 @@ store <4 x float> , <4 x float>* %x ret void } + +define void @buildvec_dominant0_v4f32(<4 x float>* %x) { +; CHECK-LABEL: buildvec_dominant0_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.w.x ft0, zero +; CHECK-NEXT: lui a1, %hi(.LCPI1_0) +; CHECK-NEXT: flw ft1, %lo(.LCPI1_0)(a1) +; CHECK-NEXT: vsetvli a1, zero, e32,m1,ta,mu +; CHECK-NEXT: vfmv.s.f v25, ft0 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vfmv.v.f v26, ft1 +; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu +; CHECK-NEXT: vslideup.vi v26, v25, 2 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vse32.v v26, (a0) +; CHECK-NEXT: ret + store <4 x float> , <4 x float>* %x + ret void +} + +define void @buildvec_dominant1_v4f32(<4 x float>* %x, float %f) { +; CHECK-LABEL: buildvec_dominant1_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.w.x ft0, zero +; CHECK-NEXT: vsetvli a1, zero, e32,m1,ta,mu +; CHECK-NEXT: vfmv.s.f v25, ft0 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vfmv.v.f v26, fa0 +; CHECK-NEXT: vsetivli a1, 2, e32,m1,tu,mu +; CHECK-NEXT: vslideup.vi v26, v25, 1 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vse32.v v26, (a0) +; CHECK-NEXT: ret + %v0 = insertelement <4 x float> undef, float %f, i32 0 + %v1 = insertelement <4 x float> %v0, float 0.0, i32 1 + %v2 = insertelement <4 x float> %v1, float %f, i32 2 + %v3 = insertelement <4 x float> %v2, float %f, i32 3 + store <4 x float> %v3, <4 x float>* %x + ret void +} + +define void @buildvec_dominant2_v4f32(<4 x float>* %x, float %f) { +; CHECK-LABEL: buildvec_dominant2_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a1, %hi(.LCPI3_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI3_0)(a1) +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli a1, zero, e32,m1,ta,mu +; CHECK-NEXT: vfmv.s.f v26, ft0 +; CHECK-NEXT: vsetivli a1, 2, e32,m1,tu,mu +; CHECK-NEXT: vslideup.vi v25, v26, 1 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %v0 = insertelement <4 x float> undef, float %f, i32 0 + %v1 = insertelement <4 x float> %v0, float 2.0, i32 1 + %v2 = insertelement <4 x float> %v1, float %f, i32 2 + %v3 = insertelement <4 x float> %v2, float %f, i32 3 + store <4 x 
float> %v3, <4 x float>* %x + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -43,20 +43,18 @@ define void @insertelt_v3i64(<3 x i64>* %x, i64 %y) { ; RV32-LABEL: insertelt_v3i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: addi a3, a0, 16 -; RV32-NEXT: vsetivli a4, 2, e32,m1,ta,mu -; RV32-NEXT: vle32.v v25, (a3) -; RV32-NEXT: vse32.v v25, (sp) ; RV32-NEXT: vsetivli a3, 2, e64,m1,ta,mu ; RV32-NEXT: vle64.v v26, (a0) ; RV32-NEXT: vsetivli a3, 8, e32,m2,ta,mu ; RV32-NEXT: vmv.v.i v28, 0 ; RV32-NEXT: vsetivli a3, 2, e64,m2,tu,mu ; RV32-NEXT: vslideup.vi v28, v26, 0 -; RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu -; RV32-NEXT: vle32.v v26, (sp) +; RV32-NEXT: lw a3, 20(a0) +; RV32-NEXT: vsetivli a4, 4, e32,m1,ta,mu +; RV32-NEXT: lw a4, 16(a0) +; RV32-NEXT: vmv.v.x v26, a3 +; RV32-NEXT: vsetvli a3, zero, e32,m1,ta,mu +; RV32-NEXT: vmv.s.x v26, a4 ; RV32-NEXT: vsetivli a3, 4, e64,m2,tu,mu ; RV32-NEXT: vslideup.vi v28, v26, 2 ; RV32-NEXT: vsetivli a3, 2, e32,m2,ta,mu @@ -69,7 +67,6 @@ ; RV32-NEXT: vse64.v v28, (a0) ; RV32-NEXT: sw a1, 16(a0) ; RV32-NEXT: sw a2, 20(a0) -; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v3i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define void @buildvec_vid_v16i8(<16 x i8>* %x) { ; CHECK-LABEL: buildvec_vid_v16i8: @@ -65,3 +65,116 @@ store <16 x i8> , <16 x i8>* %x ret void } + +define void @buildvec_dominant0_v8i16(<8 x i16>* %x) { +; CHECK-LABEL: buildvec_dominant0_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16,m1,ta,mu +; CHECK-NEXT: vmv.s.x v25, zero +; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; CHECK-NEXT: vmv.v.i v26, 8 +; CHECK-NEXT: vsetivli a1, 4, e16,m1,tu,mu +; CHECK-NEXT: vslideup.vi v26, v25, 3 +; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; CHECK-NEXT: vse16.v v26, (a0) +; CHECK-NEXT: ret + store <8 x i16> , <8 x i16>* %x + ret void +} + +define void @buildvec_dominant1_v8i16(<8 x i16>* %x) { +; CHECK-LABEL: buildvec_dominant1_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; CHECK-NEXT: vmv.v.i v25, 8 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + store <8 x i16> , <8 x i16>* %x + ret void +} + +define void @buildvec_dominant0_v2i8(<2 x i8>* %x) { 
+; CHECK-LABEL: buildvec_dominant0_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + store <2 x i8> , <2 x i8>* %x + ret void +} + +define void @buildvec_dominant1_v2i8(<2 x i8>* %x) { +; CHECK-LABEL: buildvec_dominant1_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 2, e8,m1,ta,mu +; CHECK-NEXT: vmv.v.i v25, -1 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + store <2 x i8> , <2 x i8>* %x + ret void +} + +define void @buildvec_dominant2_v2i8(<2 x i8>* %x) { +; CHECK-LABEL: buildvec_dominant2_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 2, e8,m1,ta,mu +; CHECK-NEXT: vmv.v.i v25, -1 +; CHECK-NEXT: vsetvli a1, zero, e8,m1,ta,mu +; CHECK-NEXT: vmv.s.x v25, zero +; CHECK-NEXT: vsetivli a1, 2, e8,m1,ta,mu +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + store <2 x i8> , <2 x i8>* %x + ret void +} + +define void @buildvec_dominant0_v2i32(<2 x i64>* %x) { +; RV32-LABEL: buildvec_dominant0_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI10_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI10_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v25, (a1) +; RV32-NEXT: vse32.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_dominant0_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vmv.v.i v25, -1 +; RV64-NEXT: lui a1, 3641 +; RV64-NEXT: addiw a1, a1, -455 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -455 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -455 +; RV64-NEXT: slli a1, a1, 13 +; RV64-NEXT: addi a1, a1, -910 +; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; RV64-NEXT: vmv.s.x v25, a1 +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret + store <2 x i64> , <2 x i64>* %x + ret void +} + +define void @buildvec_dominant1_optsize_v2i32(<2 x i64>* %x) optsize { +; RV32-LABEL: buildvec_dominant1_optsize_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI11_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI11_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v25, (a1) +; RV32-NEXT: vse32.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: buildvec_dominant1_optsize_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, %hi(.LCPI11_0) +; RV64-NEXT: addi a1, a1, %lo(.LCPI11_0) +; RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a1) +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret + store <2 x i64> , <2 x i64>* %x + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -872,21 +872,28 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu ; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: addi a1, zero, 1 +; CHECK-NEXT: vsetvli a2, zero, e16,m1,ta,mu +; CHECK-NEXT: vmv.s.x v26, a1 +; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; CHECK-NEXT: vmv.v.i v27, 0 +; CHECK-NEXT: vsetivli a1, 7, e16,m1,tu,mu +; CHECK-NEXT: vmv1r.v v28, v27 +; CHECK-NEXT: vslideup.vi v28, v26, 6 +; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu ; CHECK-NEXT: lui a1, %hi(.LCPI53_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_0) ; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vsrl.vv v28, v25, v28 +; CHECK-NEXT: vmulhu.vv v26, v28, v26 +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vsetvli a2, zero, e16,m1,ta,mu +; CHECK-NEXT: vmv.s.x v27, a1 +; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu ; CHECK-NEXT: lui a1, %hi(.LCPI53_1) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_1) -; CHECK-NEXT: 
vle16.v v27, (a1) -; CHECK-NEXT: vsrl.vv v26, v25, v26 -; CHECK-NEXT: vmulhu.vv v26, v26, v27 -; CHECK-NEXT: lui a1, %hi(.LCPI53_2) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_2) -; CHECK-NEXT: vle16.v v27, (a1) -; CHECK-NEXT: lui a1, %hi(.LCPI53_3) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_3) ; CHECK-NEXT: vle16.v v28, (a1) -; CHECK-NEXT: vsub.vv v25, v25, v26 ; CHECK-NEXT: vmulhu.vv v25, v25, v27 ; CHECK-NEXT: vadd.vv v25, v25, v26 ; CHECK-NEXT: vsrl.vv v25, v25, v28 @@ -907,16 +914,26 @@ ; CHECK-NEXT: addi a1, a1, %lo(.LCPI54_0) ; CHECK-NEXT: vle32.v v26, (a1) ; CHECK-NEXT: vmulhu.vv v26, v25, v26 -; CHECK-NEXT: lui a1, %hi(.LCPI54_1) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI54_1) -; CHECK-NEXT: vle32.v v27, (a1) -; CHECK-NEXT: lui a1, %hi(.LCPI54_2) -; CHECK-NEXT: addi a1, a1, %lo(.LCPI54_2) -; CHECK-NEXT: vle32.v v28, (a1) ; CHECK-NEXT: vsub.vv v25, v25, v26 -; CHECK-NEXT: vmulhu.vv v25, v25, v27 +; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: vsetvli a2, zero, e32,m1,ta,mu +; CHECK-NEXT: vmv.s.x v27, a1 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vmv.v.i v28, 0 +; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu +; CHECK-NEXT: vslideup.vi v28, v27, 2 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vmulhu.vv v25, v25, v28 ; CHECK-NEXT: vadd.vv v25, v25, v26 -; CHECK-NEXT: vsrl.vv v25, v25, v28 +; CHECK-NEXT: addi a1, zero, 1 +; CHECK-NEXT: vsetvli a2, zero, e32,m1,ta,mu +; CHECK-NEXT: vmv.s.x v26, a1 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vmv.v.i v27, 2 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,tu,mu +; CHECK-NEXT: vslideup.vi v27, v26, 3 +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vsrl.vv v25, v25, v27 ; CHECK-NEXT: vse32.v v25, (a0) ; CHECK-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x @@ -949,14 +966,33 @@ ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI55_0) -; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI55_0) -; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI55_1) -; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI55_1) -; LMULMAX1-RV64-NEXT: vle64.v v27, (a1) +; LMULMAX1-RV64-NEXT: lui a1, 1035469 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 +; LMULMAX1-RV64-NEXT: lui a1, 1026731 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmulhu.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vmv.v.i v26, 2 +; LMULMAX1-RV64-NEXT: addi a1, zero, 1 +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v26 ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x @@ -1051,25 +1087,31 @@ ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: 
vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vmul.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI59_1) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI59_1) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX1-RV32-NEXT: lui a1, 349525 +; LMULMAX1-RV32-NEXT: addi a2, a1, 1365 +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v27, a2 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1366 +; LMULMAX1-RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.s.x v27, a1 ; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v27 ; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI59_2) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI59_2) +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI59_1) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI59_1) ; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu ; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV32-NEXT: vsrl.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI59_3) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI59_3) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX1-RV32-NEXT: addi a1, zero, 1 +; LMULMAX1-RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.s.x v27, a1 +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.i v28, 0 +; LMULMAX1-RV32-NEXT: vsetivli a1, 3, e32,m1,tu,mu +; LMULMAX1-RV32-NEXT: vslideup.vi v28, v27, 2 ; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v28 ; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 ; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV32-NEXT: ret @@ -1078,13 +1120,24 @@ ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI59_0) -; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI59_0) -; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI59_1) -; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI59_1) -; LMULMAX1-RV64-NEXT: vle64.v v27, (a1) +; LMULMAX1-RV64-NEXT: vmv.v.i v26, -1 +; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v26, zero +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmul.vv v26, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 21845 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a2, a1, 1365 +; LMULMAX1-RV64-NEXT: vmv.v.x v27, a2 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1366 +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a1 +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 ; LMULMAX1-RV64-NEXT: addi a1, zero, 63 @@ -3983,21 +4036,31 @@ ; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) ; LMULMAX1-RV32-NEXT: vle32.v v27, (a0) ; LMULMAX1-RV32-NEXT: vmulhu.vv v28, v25, v26 -; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI131_1) -; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI131_1) -; LMULMAX1-RV32-NEXT: vle32.v v29, (a2) -; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI131_2) -; LMULMAX1-RV32-NEXT: addi a2, a2, 
%lo(.LCPI131_2) -; LMULMAX1-RV32-NEXT: vle32.v v30, (a2) ; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v28 -; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v29 +; LMULMAX1-RV32-NEXT: lui a2, 524288 +; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.s.x v29, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.i v30, 0 +; LMULMAX1-RV32-NEXT: vsetivli a2, 3, e32,m1,tu,mu +; LMULMAX1-RV32-NEXT: vslideup.vi v30, v29, 2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v30 ; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 -; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v30 +; LMULMAX1-RV32-NEXT: addi a2, zero, 1 +; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.s.x v28, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.i v29, 2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,tu,mu +; LMULMAX1-RV32-NEXT: vslideup.vi v29, v28, 3 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v29 ; LMULMAX1-RV32-NEXT: vmulhu.vv v26, v27, v26 ; LMULMAX1-RV32-NEXT: vsub.vv v27, v27, v26 -; LMULMAX1-RV32-NEXT: vmulhu.vv v27, v27, v29 +; LMULMAX1-RV32-NEXT: vmulhu.vv v27, v27, v30 ; LMULMAX1-RV32-NEXT: vadd.vv v26, v27, v26 -; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v29 ; LMULMAX1-RV32-NEXT: vse32.v v26, (a0) ; LMULMAX1-RV32-NEXT: vse32.v v25, (a1) ; LMULMAX1-RV32-NEXT: ret @@ -4047,34 +4110,75 @@ ; ; LMULMAX1-RV64-LABEL: mulhu_v4i64: ; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v25, (a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_0) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_0) -; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_1) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_1) -; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) -; LMULMAX1-RV64-NEXT: vle64.v v28, (a0) -; LMULMAX1-RV64-NEXT: vmulhu.vv v26, v25, v26 -; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) +; LMULMAX1-RV64-NEXT: lui a3, 1044935 +; LMULMAX1-RV64-NEXT: addiw a3, a3, 455 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, 455 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, 455 +; LMULMAX1-RV64-NEXT: slli a3, a3, 13 +; LMULMAX1-RV64-NEXT: addi a3, a3, 911 +; LMULMAX1-RV64-NEXT: vmv.v.x v27, a3 +; LMULMAX1-RV64-NEXT: lui a3, 4681 +; LMULMAX1-RV64-NEXT: addiw a3, a3, 585 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, 585 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, 585 +; LMULMAX1-RV64-NEXT: slli a3, a3, 13 +; LMULMAX1-RV64-NEXT: addi a3, a3, 1171 +; LMULMAX1-RV64-NEXT: vsetvli a4, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a3 +; LMULMAX1-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmulhu.vv v27, v26, v27 +; LMULMAX1-RV64-NEXT: vsub.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vmv.v.i v28, 0 +; LMULMAX1-RV64-NEXT: addi a3, zero, -1 +; LMULMAX1-RV64-NEXT: slli a3, a3, 63 +; LMULMAX1-RV64-NEXT: vsetvli a4, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v28, a3 +; LMULMAX1-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmulhu.vv v26, v26, v28 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vmv.v.i v27, 3 
+; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vsrl.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: lui a2, 1035469 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -819 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -819 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -819 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -819 +; LMULMAX1-RV64-NEXT: vmv.v.x v27, a2 +; LMULMAX1-RV64-NEXT: lui a2, 1026731 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -1365 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -1365 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -1365 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, -1365 +; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmulhu.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_2) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_2) -; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_3) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_3) -; LMULMAX1-RV64-NEXT: vle64.v v29, (a2) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI132_4) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI132_4) -; LMULMAX1-RV64-NEXT: vle64.v v30, (a2) -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vmv.v.i v27, 2 +; LMULMAX1-RV64-NEXT: addi a2, zero, 1 +; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vmulhu.vv v26, v28, v29 -; LMULMAX1-RV64-NEXT: vsrl.vv v26, v26, v30 -; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v25, (a1) +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a1) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = udiv <4 x i64> %a, @@ -4247,31 +4351,42 @@ ; LMULMAX1-RV64-LABEL: mulhs_v4i64: ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v25, (a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI136_0) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI136_0) -; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI136_1) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI136_1) -; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) -; LMULMAX1-RV64-NEXT: vle64.v v28, (a0) -; LMULMAX1-RV64-NEXT: vmul.vv v29, v25, v26 -; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v29 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) +; LMULMAX1-RV64-NEXT: vmv.v.i v27, -1 +; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, zero +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmul.vv v28, v26, v27 +; LMULMAX1-RV64-NEXT: lui a2, 21845 +; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, 1365 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, 1365 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a3, a2, 1365 +; LMULMAX1-RV64-NEXT: vmv.v.x v29, a3 +; LMULMAX1-RV64-NEXT: addi a2, a2, 1366 +; LMULMAX1-RV64-NEXT: vsetvli 
a3, zero, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v29, a2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmulh.vv v26, v26, v29 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v28 ; LMULMAX1-RV64-NEXT: addi a2, zero, 63 -; LMULMAX1-RV64-NEXT: vsrl.vx v29, v25, a2 +; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, a2 ; LMULMAX1-RV64-NEXT: vid.v v30 -; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v30 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v29 -; LMULMAX1-RV64-NEXT: vmul.vv v26, v28, v26 -; LMULMAX1-RV64-NEXT: vmulh.vv v27, v28, v27 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v27, v26 -; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, a2 ; LMULMAX1-RV64-NEXT: vsra.vv v26, v26, v30 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v25, (a1) +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v28 +; LMULMAX1-RV64-NEXT: vmul.vv v27, v25, v27 +; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v29 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, a2 +; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v30 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a1) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = sdiv <4 x i64> %a,