diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1138,6 +1138,44 @@
     ReplaceNode(Node, Extract.getNode());
     return;
   }
+  case RISCVISD::VMV_V_X_VL:
+  case RISCVISD::VFMV_V_F_VL: {
+    // Try to match splat of a scalar load to a strided load with stride of x0.
+    SDValue Src = Node->getOperand(0);
+    auto *Ld = dyn_cast<LoadSDNode>(Src);
+    if (!Ld)
+      break;
+    EVT MemVT = Ld->getMemoryVT();
+    // The memory VT should be the same size as the element type.
+    if (MemVT.getStoreSize() != VT.getVectorElementType().getStoreSize())
+      break;
+    if (!IsProfitableToFold(Src, Node, Node) ||
+        !IsLegalToFold(Src, Node, Node, TM.getOptLevel()))
+      break;
+
+    SDValue VL;
+    selectVLOp(Node->getOperand(1), VL);
+
+    unsigned ScalarSize = VT.getScalarSizeInBits();
+    SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+
+    SDValue Operands[] = {Ld->getBasePtr(),
+                          CurDAG->getRegister(RISCV::X0, XLenVT), VL, SEW,
+                          Ld->getChain()};
+
+    RISCVVLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+    const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(
+        /*IsMasked*/ false, /*IsStrided*/ true, /*FF*/ false, ScalarSize,
+        static_cast<unsigned>(LMUL));
+    MachineSDNode *Load =
+        CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
+
+    if (auto *MemOp = dyn_cast<MemSDNode>(Node))
+      CurDAG->setNodeMemRefs(Load, {MemOp->getMemOperand()});
+
+    ReplaceNode(Node, Load);
+    return;
+  }
   }
 
   // Select the default instruction.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1632,17 +1632,40 @@
 
         SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
                                                    TypeSize::Fixed(Offset), DL);
-        SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
-        SDValue IntID =
-            DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
-        SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
-                         DAG.getRegister(RISCV::X0, XLenVT), VL};
-        SDValue NewLoad = DAG.getMemIntrinsicNode(
-            ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
-            DAG.getMachineFunction().getMachineMemOperand(
-                Ld->getMemOperand(), Offset, SVT.getStoreSize()));
-        DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
-        return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+        // If this is SEW=64 on RV32, use a strided load with a stride of x0.
+        if (SVT.isInteger() && SVT.bitsGT(XLenVT)) {
+          SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+          SDValue IntID =
+              DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
+          SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
+                           DAG.getRegister(RISCV::X0, XLenVT), VL};
+          SDValue NewLoad = DAG.getMemIntrinsicNode(
+              ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
+              DAG.getMachineFunction().getMachineMemOperand(
+                  Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+          DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
+          return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+        }
+
+        // Otherwise use a scalar load and splat. This will give the best
+        // opportunity to fold a splat into the operation. ISel can turn it into
+        // the x0 strided load if we aren't able to fold away the select.
+        if (SVT.isFloatingPoint())
+          V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+                          Ld->getPointerInfo().getWithOffset(Offset),
+                          Ld->getOriginalAlign(),
+                          Ld->getMemOperand()->getFlags());
+        else
+          V = DAG.getExtLoad(ISD::SEXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr,
+                             Ld->getPointerInfo().getWithOffset(Offset), SVT,
+                             Ld->getOriginalAlign(),
+                             Ld->getMemOperand()->getFlags());
+        DAG.makeEquivalentMemoryOrdering(Ld, V);
+
+        unsigned Opc =
+            VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
+        SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, V, VL);
+        return convertFromScalableVector(VT, Splat, DAG, Subtarget);
       }
 
   V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -23,16 +23,16 @@
 define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
 ; CHECK-LABEL: buildvec_dominant0_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK-NEXT:    flw ft0, %lo(.LCPI1_0)(a1)
-; CHECK-NEXT:    fmv.w.x ft1, zero
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT:    vfmv.s.f v25, ft1
-; CHECK-NEXT:    vfmv.v.f v26, ft0
+; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI1_0)
+; CHECK-NEXT:    vlse32.v v25, (a1), zero
+; CHECK-NEXT:    fmv.w.x ft0, zero
+; CHECK-NEXT:    vfmv.s.f v26, ft0
 ; CHECK-NEXT:    vsetivli a1, 3, e32,m1,tu,mu
-; CHECK-NEXT:    vslideup.vi v26, v25, 2
+; CHECK-NEXT:    vslideup.vi v25, v26, 2
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT:    vse32.v v26, (a0)
+; CHECK-NEXT:    vse32.v v25, (a0)
 ; CHECK-NEXT:    ret
   store <4 x float> <float 2.0, float 2.0, float 0.0, float 2.0>, <4 x float>* %x
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll
@@ -159,12 +159,12 @@
 ; RV32-LABEL: vrgather_shuffle_xv_v4f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi a0, zero, 12
-; RV32-NEXT:    lui a1, %hi(.LCPI7_0)
-; RV32-NEXT:    fld ft0, %lo(.LCPI7_0)(a1)
 ; RV32-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.v.f v26, ft0
+; RV32-NEXT:    lui a0, %hi(.LCPI7_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI7_0)
+; RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV32-NEXT:    vlse64.v v26, (a0), zero
 ; RV32-NEXT:    lui a0, %hi(.LCPI7_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI7_1)
 ; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
@@ -181,11 +181,11 @@
 ; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    lui a0, %hi(.LCPI7_0)
 ; RV64-NEXT:    addi a0, a0, %lo(.LCPI7_0)
-; RV64-NEXT:    lui a1, %hi(.LCPI7_1)
-; RV64-NEXT:    fld ft0, %lo(.LCPI7_1)(a1)
 ; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
 ; RV64-NEXT:    vle64.v v28, (a0)
-; RV64-NEXT:    vfmv.v.f v26, ft0
+; RV64-NEXT:    lui a0, %hi(.LCPI7_1)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI7_1)
+; RV64-NEXT:    vlse64.v v26, (a0), zero
 ; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
 ; RV64-NEXT:    vrgather.vv v26, v8, v28, v0.t
 ; RV64-NEXT:    vmv2r.v v8, v26
@@ -203,12 +203,12 @@
 ; RV32-NEXT:    vsetivli a1, 4, e16,m1,ta,mu
 ; RV32-NEXT:    vmv.s.x v25, a0
 ; RV32-NEXT:    vmv.v.i v28, 0
-; RV32-NEXT:    lui a0, %hi(.LCPI8_0)
-; RV32-NEXT:    fld ft0, %lo(.LCPI8_0)(a0)
 ; RV32-NEXT:    vsetivli a0, 2, e16,m1,tu,mu
 ; RV32-NEXT:    vslideup.vi v28, v25, 1
-; RV32-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT:    vfmv.v.f v26, ft0
+; RV32-NEXT:    lui a0, %hi(.LCPI8_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI8_0)
+; RV32-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV32-NEXT:    vlse64.v v26, (a0), zero
 ; RV32-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
 ; RV32-NEXT:    vrgatherei16.vv v26, v8, v28, v0.t
 ; RV32-NEXT:    vmv2r.v v8, v26
@@ -222,12 +222,12 @@
 ; RV64-NEXT:    vmv.v.i v28, 0
 ; RV64-NEXT:    vsetivli a1, 2, e64,m2,tu,mu
 ; RV64-NEXT:    vslideup.vi v28, v26, 1
-; RV64-NEXT:    lui a1, %hi(.LCPI8_0)
-; RV64-NEXT:    fld ft0, %lo(.LCPI8_0)(a1)
 ; RV64-NEXT:    vsetivli a1, 1, e8,m1,ta,mu
 ; RV64-NEXT:    vmv.s.x v0, a0
-; RV64-NEXT:    vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT:    vfmv.v.f v26, ft0
+; RV64-NEXT:    lui a0, %hi(.LCPI8_0)
+; RV64-NEXT:    addi a0, a0, %lo(.LCPI8_0)
+; RV64-NEXT:    vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT:    vlse64.v v26, (a0), zero
 ; RV64-NEXT:    vsetivli a0, 4, e64,m2,tu,mu
 ; RV64-NEXT:    vrgather.vv v26, v8, v28, v0.t
 ; RV64-NEXT:    vmv2r.v v8, v26
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -49,11 +49,11 @@
 ; RV32-NEXT:    vmv.v.i v28, 0
 ; RV32-NEXT:    vsetivli a3, 2, e64,m2,tu,mu
 ; RV32-NEXT:    vslideup.vi v28, v26, 0
-; RV32-NEXT:    lw a3, 20(a0)
+; RV32-NEXT:    addi a3, a0, 20
 ; RV32-NEXT:    vsetivli a4, 4, e32,m1,ta,mu
-; RV32-NEXT:    lw a4, 16(a0)
-; RV32-NEXT:    vmv.v.x v26, a3
-; RV32-NEXT:    vmv.s.x v26, a4
+; RV32-NEXT:    vlse32.v v26, (a3), zero
+; RV32-NEXT:    lw a3, 16(a0)
+; RV32-NEXT:    vmv.s.x v26, a3
 ; RV32-NEXT:    vsetivli a3, 4, e64,m2,tu,mu
 ; RV32-NEXT:    vslideup.vi v28, v26, 2
 ; RV32-NEXT:    vsetivli a3, 2, e32,m2,ta,mu
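
The following standalone LLVM IR is a minimal sketch (not taken from the patch; the function name, register numbers, and vsetivli configuration are illustrative assumptions) of the splat-of-a-scalar-load pattern that the new RISCVISD::VMV_V_X_VL/VFMV_V_F_VL isel case is meant to catch when the lowering above falls back to a scalar load plus splat:

; Illustrative only: a splat of a loaded scalar. With the isel case above this
; is expected to select a zero-strided load instead of fld + vfmv.v.f, roughly:
;   vsetivli a1, 4, e64,m2,ta,mu
;   vlse64.v v8, (a0), zero
;   ret
define <4 x double> @splat_load_v4f64(double* %p) {
  %x = load double, double* %p
  %head = insertelement <4 x double> undef, double %x, i32 0
  %splat = shufflevector <4 x double> %head, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %splat
}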