diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -452,6 +452,11 @@
                       "true", "Has reasonably performant unaligned scalar "
                       "loads and stores">;
 
+def TuneNoOptimizedZeroStrideLoad
+   : SubtargetFeature<"no-optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
+                      "false", "Hasn't optimized (perform fewer memory operations) "
+                      "zero-stride vector load">;
+
 def TuneLUIADDIFusion
     : SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion",
                        "true", "Enable LUI+ADDI macrofusion">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1790,6 +1790,10 @@
   case RISCVISD::VFMV_S_F_VL:
   case RISCVISD::VMV_V_X_VL:
   case RISCVISD::VFMV_V_F_VL: {
+    // Only fold if the target has an optimized zero-stride vector load.
+    if (!Subtarget->hasOptimizedZeroStrideLoad())
+      break;
+
     // Try to match splat of a scalar load to a strided load with stride of x0.
     bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL ||
                         Node->getOpcode() == RISCVISD::VFMV_S_F_VL;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -101,6 +101,7 @@
   bool HasShortForwardBranchOpt = false;
   bool HasLUIADDIFusion = false;
   bool HasForcedAtomics = false;
+  bool HasOptimizedZeroStrideLoad = true;
   unsigned XLen = 32;
   unsigned ZvlLen = 0;
   MVT XLenVT = MVT::i32;
@@ -199,6 +200,7 @@
   bool enableUnalignedScalarMem() const { return EnableUnalignedScalarMem; }
   bool hasLUIADDIFusion() const { return HasLUIADDIFusion; }
   bool hasForcedAtomics() const { return HasForcedAtomics; }
+  bool hasOptimizedZeroStrideLoad() const { return HasOptimizedZeroStrideLoad; }
   MVT getXLenVT() const { return XLenVT; }
   unsigned getXLen() const { return XLen; }
   unsigned getFLen() const {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+f,+d,+zfh,+experimental-zvfh,+v -target-abi ilp32d -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s
+; RUN:   | FileCheck %s --check-prefixes=CHECK,OPTIMIZED
 ; RUN: llc -mtriple=riscv64 -mattr=+f,+d,+zfh,+experimental-zvfh,+v -target-abi lp64d -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s
+; RUN:   | FileCheck %s --check-prefixes=CHECK,OPTIMIZED
+; RUN: llc -mtriple=riscv32 -mattr=+f,+d,+zfh,+experimental-zvfh,+v,+no-optimized-zero-stride-load -target-abi ilp32d -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
+; RUN: llc -mtriple=riscv64 -mattr=+f,+d,+zfh,+experimental-zvfh,+v,+no-optimized-zero-stride-load -target-abi lp64d -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,NOT-OPTIMIZED
 
 define <vscale x 8 x half> @vsplat_nxv8f16(half %f) {
 ; CHECK-LABEL: vsplat_nxv8f16:
@@ -72,11 +76,18 @@
 
 ; Test that we fold this to a vlse with 0 stride.
 define <vscale x 8 x float> @vsplat_load_nxv8f32(float* %ptr) {
-; CHECK-LABEL: vsplat_load_nxv8f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vlse32.v v8, (a0), zero
-; CHECK-NEXT:    ret
+; OPTIMIZED-LABEL: vsplat_load_nxv8f32:
+; OPTIMIZED:       # %bb.0:
+; OPTIMIZED-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; OPTIMIZED-NEXT:    vlse32.v v8, (a0), zero
+; OPTIMIZED-NEXT:    ret
+;
+; NOT-OPTIMIZED-LABEL: vsplat_load_nxv8f32:
+; NOT-OPTIMIZED:       # %bb.0:
+; NOT-OPTIMIZED-NEXT:    flw ft0, 0(a0)
+; NOT-OPTIMIZED-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; NOT-OPTIMIZED-NEXT:    vfmv.v.f v8, ft0
+; NOT-OPTIMIZED-NEXT:    ret
   %f = load float, float* %ptr
   %head = insertelement <vscale x 8 x float> poison, float %f, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
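
Usage note (not part of the patch itself): HasOptimizedZeroStrideLoad defaults to true, so the vlse-with-x0-stride fold stays enabled unless a target opts out, either on the command line with -mattr=+no-optimized-zero-stride-load (as in the new RUN lines above) or by listing the tune feature in a processor definition. The sketch below is hypothetical: the core name "my-core" and its feature lists are invented, and it assumes the RISCVProcessorModel class in llvm/lib/Target/RISCV/RISCVProcessors.td takes a tune-feature list as its fourth operand.

    // Hypothetical processor definition that opts out of the zero-stride
    // vector load optimization via the new tune feature.
    def MY_CORE : RISCVProcessorModel<"my-core", NoSchedModel,
                                      [Feature64Bit, FeatureStdExtV],
                                      [TuneNoOptimizedZeroStrideLoad]>;

With such a definition, llc -mcpu=my-core would emit the scalar flw + vfmv.v.f sequence from the NOT-OPTIMIZED checks instead of vlse32.v with a zero stride.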