diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5450,33 +5450,50 @@
     }
   }
 
+  auto *Load = cast<MemIntrinsicSDNode>(Op);
   SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+  SDValue Ptr = Op.getOperand(3);
+  SDValue Stride = Op.getOperand(4);
+  SDValue Result, Chain;
+
+  // TODO: We restrict this to unmasked loads currently in consideration of
+  // the complexity of handling all-false masks.
+  if (IsUnmasked && isNullConstant(Stride) &&
+      !Subtarget.hasOptimizedZeroStrideLoad()) {
+    MVT ScalarVT = ContainerVT.getVectorElementType();
+    SDValue ScalarLoad =
+        DAG.getExtLoad(ISD::ZEXTLOAD, DL, XLenVT, Load->getChain(), Ptr,
+                       ScalarVT, Load->getMemOperand());
+    Chain = ScalarLoad.getValue(1);
+    Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
+                              Subtarget);
+  } else {
+    SDValue IntID = DAG.getTargetConstant(
+        IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
+        XLenVT);
-  SDValue IntID = DAG.getTargetConstant(
-      IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
-      XLenVT);
 
+    SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
+    if (IsUnmasked)
+      Ops.push_back(DAG.getUNDEF(ContainerVT));
+    else
+      Ops.push_back(PassThru);
+    Ops.push_back(Ptr);
+    Ops.push_back(Stride);
+    if (!IsUnmasked)
+      Ops.push_back(Mask);
+    Ops.push_back(VL);
+    if (!IsUnmasked) {
+      SDValue Policy =
+          DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
+      Ops.push_back(Policy);
+    }
-  auto *Load = cast<MemIntrinsicSDNode>(Op);
-  SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
-  if (IsUnmasked)
-    Ops.push_back(DAG.getUNDEF(ContainerVT));
-  else
-    Ops.push_back(PassThru);
-  Ops.push_back(Op.getOperand(3)); // Ptr
-  Ops.push_back(Op.getOperand(4)); // Stride
-  if (!IsUnmasked)
-    Ops.push_back(Mask);
-  Ops.push_back(VL);
-  if (!IsUnmasked) {
-    SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
-    Ops.push_back(Policy);
+    SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+    Result =
+        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+                                Load->getMemoryVT(), Load->getMemOperand());
+    Chain = Result.getValue(1);
   }
-
-  SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
-  SDValue Result =
-      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
-                              Load->getMemoryVT(), Load->getMemOperand());
-  SDValue Chain = Result.getValue(1);
   if (VT.isFixedLengthVector())
     Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
   return DAG.getMergeValues({Result, Chain}, DL);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,V
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,ZVE32F
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+no-optimized-zero-stride-load -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
 
 %struct.foo = type { i32, i32, i32, i32 }
 
@@ -176,24 +178,62 @@
 define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
 ;
-; CHECK-LABEL: gather_zero_stride:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    li a2, 0
-; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    li a4, 1024
-; CHECK-NEXT:  .LBB3_1: # %vector.body
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-NEXT:    vlse8.v v8, (a1), zero
-; CHECK-NEXT:    add a5, a0, a2
-; CHECK-NEXT:    vle8.v v9, (a5)
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vse8.v v8, (a5)
-; CHECK-NEXT:    addi a2, a2, 32
-; CHECK-NEXT:    addi a1, a1, 160
-; CHECK-NEXT:    bne a2, a4, .LBB3_1
-; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
-; CHECK-NEXT:    ret
+; V-LABEL: gather_zero_stride:
+; V:       # %bb.0: # %entry
+; V-NEXT:    li a2, 0
+; V-NEXT:    li a3, 32
+; V-NEXT:    li a4, 1024
+; V-NEXT:  .LBB3_1: # %vector.body
+; V-NEXT:    # =>This Inner Loop Header: Depth=1
+; V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; V-NEXT:    vlse8.v v8, (a1), zero
+; V-NEXT:    add a5, a0, a2
+; V-NEXT:    vle8.v v9, (a5)
+; V-NEXT:    vadd.vv v8, v9, v8
+; V-NEXT:    vse8.v v8, (a5)
+; V-NEXT:    addi a2, a2, 32
+; V-NEXT:    addi a1, a1, 160
+; V-NEXT:    bne a2, a4, .LBB3_1
+; V-NEXT:  # %bb.2: # %for.cond.cleanup
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: gather_zero_stride:
+; ZVE32F:       # %bb.0: # %entry
+; ZVE32F-NEXT:    li a2, 0
+; ZVE32F-NEXT:    li a3, 32
+; ZVE32F-NEXT:    li a4, 1024
+; ZVE32F-NEXT:  .LBB3_1: # %vector.body
+; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
+; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; ZVE32F-NEXT:    vlse8.v v8, (a1), zero
+; ZVE32F-NEXT:    add a5, a0, a2
+; ZVE32F-NEXT:    vle8.v v9, (a5)
+; ZVE32F-NEXT:    vadd.vv v8, v9, v8
+; ZVE32F-NEXT:    vse8.v v8, (a5)
+; ZVE32F-NEXT:    addi a2, a2, 32
+; ZVE32F-NEXT:    addi a1, a1, 160
+; ZVE32F-NEXT:    bne a2, a4, .LBB3_1
+; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
+; ZVE32F-NEXT:    ret
+;
+; NOT-OPTIMIZED-LABEL: gather_zero_stride:
+; NOT-OPTIMIZED:       # %bb.0: # %entry
+; NOT-OPTIMIZED-NEXT:    li a2, 0
+; NOT-OPTIMIZED-NEXT:    li a3, 32
+; NOT-OPTIMIZED-NEXT:    li a4, 1024
+; NOT-OPTIMIZED-NEXT:  .LBB3_1: # %vector.body
+; NOT-OPTIMIZED-NEXT:    # =>This Inner Loop Header: Depth=1
+; NOT-OPTIMIZED-NEXT:    lbu a5, 0(a1)
+; NOT-OPTIMIZED-NEXT:    add a6, a0, a2
+; NOT-OPTIMIZED-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; NOT-OPTIMIZED-NEXT:    vle8.v v8, (a6)
+; NOT-OPTIMIZED-NEXT:    vadd.vx v8, v8, a5
+; NOT-OPTIMIZED-NEXT:    vse8.v v8, (a6)
+; NOT-OPTIMIZED-NEXT:    addi a2, a2, 32
+; NOT-OPTIMIZED-NEXT:    addi a1, a1, 160
+; NOT-OPTIMIZED-NEXT:    bne a2, a4, .LBB3_1
+; NOT-OPTIMIZED-NEXT:  # %bb.2: # %for.cond.cleanup
+; NOT-OPTIMIZED-NEXT:    ret
 entry:
   br label %vector.body