diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4863,6 +4863,7 @@ MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *Intr) const { + const MachineFunction &MF = *MI.getMF(); const unsigned NumDefs = MI.getNumExplicitDefs(); const unsigned ArgOffset = NumDefs + 1; bool IsTFE = NumDefs == 2; @@ -4966,7 +4967,8 @@ IsG16); // See also below in the non-a16 branch - const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && + const bool UseNSA = ST.hasNSAEncoding() && + PackedRegs.size() >= ST.getNSAThreshold(MF) && PackedRegs.size() <= ST.getNSAMaxSize(); if (!UseNSA && PackedRegs.size() > 1) { @@ -5008,7 +5010,8 @@ // TODO: we can actually allow partial NSA where the final register is a // contiguous set of the remaining addresses. // This could help where there are more addresses than supported. 
-  const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
+  const bool UseNSA = ST.hasNSAEncoding() &&
+                      CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                       CorrectedNumVAddrs <= ST.getNSAMaxSize();
 
   if (!UseNSA && Intr->NumVAddrs > 1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -54,6 +54,10 @@
     cl::desc("Enable the use of AA during codegen."),
     cl::init(true));
 
+static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold",
+    cl::desc("Number of addresses from which to enable MIMG NSA."),
+    cl::init(3), cl::Hidden);
+
 GCNSubtarget::~GCNSubtarget() = default;
 
 GCNSubtarget &
@@ -950,6 +954,17 @@
              : nullptr;
 }
 
+unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
+  if (NSAThreshold.getNumOccurrences() > 0)
+    return std::max(NSAThreshold.getValue(), 2u);
+
+  int Value = AMDGPU::getIntegerAttribute(MF.getFunction(), "amdgpu-nsa-threshold", -1);
+  if (Value > 0)
+    return std::max(Value, 2);
+
+  return 3;
+}
+
 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
   if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
     return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<GCNSubtarget>());
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1305,6 +1305,10 @@
   // \returns true if it's beneficial on this subtarget for the scheduler to
   // cluster stores as well as loads.
   bool shouldClusterStores() const { return getGeneration() >= GFX11; }
+
+  // \returns the number of address arguments from which to enable MIMG NSA
+  // on supported architectures.
+ unsigned getNSAThreshold(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6522,7 +6522,7 @@ // contiguous set of the remaining addresses. // This could help where there are more addresses than supported. bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && - VAddrs.size() >= 3 && + VAddrs.size() >= (unsigned)ST->getNSAThreshold(MF) && VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); SDValue VAddr; if (!UseNSA) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-nsa-threshold.ll @@ -0,0 +1,285 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=ATTRIB %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-2 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=3 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-3 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=4 -verify-machineinstrs < %s | FileCheck -check-prefix=FORCE-4 %s + +; Note: command line argument should override function attribute. 
+ +define amdgpu_ps <4 x float> @sample_2d_nsa2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) #2 { +; ATTRIB-LABEL: sample_2d_nsa2: +; ATTRIB: ; %bb.0: ; %main_body +; ATTRIB-NEXT: s_mov_b32 s12, exec_lo +; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo +; ATTRIB-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; ATTRIB-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; ATTRIB-NEXT: s_waitcnt vmcnt(0) +; ATTRIB-NEXT: ; return to shader part epilog +; +; FORCE-2-LABEL: sample_2d_nsa2: +; FORCE-2: ; %bb.0: ; %main_body +; FORCE-2-NEXT: s_mov_b32 s12, exec_lo +; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-2-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-2-NEXT: s_waitcnt vmcnt(0) +; FORCE-2-NEXT: ; return to shader part epilog +; +; FORCE-3-LABEL: sample_2d_nsa2: +; FORCE-3: ; %bb.0: ; %main_body +; FORCE-3-NEXT: s_mov_b32 s12, exec_lo +; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-3-NEXT: v_mov_b32_e32 v2, v0 +; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-3-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-3-NEXT: s_waitcnt vmcnt(0) +; FORCE-3-NEXT: ; return to shader part epilog +; +; FORCE-4-LABEL: sample_2d_nsa2: +; FORCE-4: ; %bb.0: ; %main_body +; FORCE-4-NEXT: s_mov_b32 s12, exec_lo +; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-4-NEXT: v_mov_b32_e32 v2, v0 +; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-4-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-4-NEXT: s_waitcnt vmcnt(0) +; FORCE-4-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +define 
amdgpu_ps <4 x float> @sample_3d_nsa2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) #2 { +; ATTRIB-LABEL: sample_3d_nsa2: +; ATTRIB: ; %bb.0: ; %main_body +; ATTRIB-NEXT: s_mov_b32 s12, exec_lo +; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo +; ATTRIB-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; ATTRIB-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; ATTRIB-NEXT: s_waitcnt vmcnt(0) +; ATTRIB-NEXT: ; return to shader part epilog +; +; FORCE-2-LABEL: sample_3d_nsa2: +; FORCE-2: ; %bb.0: ; %main_body +; FORCE-2-NEXT: s_mov_b32 s12, exec_lo +; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-2-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-2-NEXT: s_waitcnt vmcnt(0) +; FORCE-2-NEXT: ; return to shader part epilog +; +; FORCE-3-LABEL: sample_3d_nsa2: +; FORCE-3: ; %bb.0: ; %main_body +; FORCE-3-NEXT: s_mov_b32 s12, exec_lo +; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-3-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-3-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-3-NEXT: s_waitcnt vmcnt(0) +; FORCE-3-NEXT: ; return to shader part epilog +; +; FORCE-4-LABEL: sample_3d_nsa2: +; FORCE-4: ; %bb.0: ; %main_body +; FORCE-4-NEXT: s_mov_b32 s12, exec_lo +; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-4-NEXT: v_mov_b32_e32 v3, v0 +; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-4-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-4-NEXT: s_waitcnt vmcnt(0) +; FORCE-4-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) 
+ ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_2d_nsa3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) #3 { +; ATTRIB-LABEL: sample_2d_nsa3: +; ATTRIB: ; %bb.0: ; %main_body +; ATTRIB-NEXT: s_mov_b32 s12, exec_lo +; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo +; ATTRIB-NEXT: v_mov_b32_e32 v2, v0 +; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; ATTRIB-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; ATTRIB-NEXT: s_waitcnt vmcnt(0) +; ATTRIB-NEXT: ; return to shader part epilog +; +; FORCE-2-LABEL: sample_2d_nsa3: +; FORCE-2: ; %bb.0: ; %main_body +; FORCE-2-NEXT: s_mov_b32 s12, exec_lo +; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-2-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-2-NEXT: s_waitcnt vmcnt(0) +; FORCE-2-NEXT: ; return to shader part epilog +; +; FORCE-3-LABEL: sample_2d_nsa3: +; FORCE-3: ; %bb.0: ; %main_body +; FORCE-3-NEXT: s_mov_b32 s12, exec_lo +; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-3-NEXT: v_mov_b32_e32 v2, v0 +; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-3-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-3-NEXT: s_waitcnt vmcnt(0) +; FORCE-3-NEXT: ; return to shader part epilog +; +; FORCE-4-LABEL: sample_2d_nsa3: +; FORCE-4: ; %bb.0: ; %main_body +; FORCE-4-NEXT: s_mov_b32 s12, exec_lo +; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-4-NEXT: v_mov_b32_e32 v2, v0 +; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-4-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-4-NEXT: s_waitcnt vmcnt(0) +; FORCE-4-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + 
+define amdgpu_ps <4 x float> @sample_3d_nsa3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) #3 { +; ATTRIB-LABEL: sample_3d_nsa3: +; ATTRIB: ; %bb.0: ; %main_body +; ATTRIB-NEXT: s_mov_b32 s12, exec_lo +; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo +; ATTRIB-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; ATTRIB-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; ATTRIB-NEXT: s_waitcnt vmcnt(0) +; ATTRIB-NEXT: ; return to shader part epilog +; +; FORCE-2-LABEL: sample_3d_nsa3: +; FORCE-2: ; %bb.0: ; %main_body +; FORCE-2-NEXT: s_mov_b32 s12, exec_lo +; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-2-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-2-NEXT: s_waitcnt vmcnt(0) +; FORCE-2-NEXT: ; return to shader part epilog +; +; FORCE-3-LABEL: sample_3d_nsa3: +; FORCE-3: ; %bb.0: ; %main_body +; FORCE-3-NEXT: s_mov_b32 s12, exec_lo +; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-3-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-3-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-3-NEXT: s_waitcnt vmcnt(0) +; FORCE-3-NEXT: ; return to shader part epilog +; +; FORCE-4-LABEL: sample_3d_nsa3: +; FORCE-4: ; %bb.0: ; %main_body +; FORCE-4-NEXT: s_mov_b32 s12, exec_lo +; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-4-NEXT: v_mov_b32_e32 v3, v0 +; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-4-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-4-NEXT: s_waitcnt vmcnt(0) +; FORCE-4-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, 
i32 0) + ret <4 x float> %v +} + +define amdgpu_ps <4 x float> @sample_2d_nsa4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) #4 { +; ATTRIB-LABEL: sample_2d_nsa4: +; ATTRIB: ; %bb.0: ; %main_body +; ATTRIB-NEXT: s_mov_b32 s12, exec_lo +; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo +; ATTRIB-NEXT: v_mov_b32_e32 v2, v0 +; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; ATTRIB-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; ATTRIB-NEXT: s_waitcnt vmcnt(0) +; ATTRIB-NEXT: ; return to shader part epilog +; +; FORCE-2-LABEL: sample_2d_nsa4: +; FORCE-2: ; %bb.0: ; %main_body +; FORCE-2-NEXT: s_mov_b32 s12, exec_lo +; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-2-NEXT: image_sample v[0:3], [v1, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-2-NEXT: s_waitcnt vmcnt(0) +; FORCE-2-NEXT: ; return to shader part epilog +; +; FORCE-3-LABEL: sample_2d_nsa4: +; FORCE-3: ; %bb.0: ; %main_body +; FORCE-3-NEXT: s_mov_b32 s12, exec_lo +; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-3-NEXT: v_mov_b32_e32 v2, v0 +; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-3-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-3-NEXT: s_waitcnt vmcnt(0) +; FORCE-3-NEXT: ; return to shader part epilog +; +; FORCE-4-LABEL: sample_2d_nsa4: +; FORCE-4: ; %bb.0: ; %main_body +; FORCE-4-NEXT: s_mov_b32 s12, exec_lo +; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-4-NEXT: v_mov_b32_e32 v2, v0 +; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-4-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; FORCE-4-NEXT: s_waitcnt vmcnt(0) +; FORCE-4-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v 
+} + +define amdgpu_ps <4 x float> @sample_3d_nsa4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) #4 { +; ATTRIB-LABEL: sample_3d_nsa4: +; ATTRIB: ; %bb.0: ; %main_body +; ATTRIB-NEXT: s_mov_b32 s12, exec_lo +; ATTRIB-NEXT: s_wqm_b32 exec_lo, exec_lo +; ATTRIB-NEXT: v_mov_b32_e32 v3, v0 +; ATTRIB-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; ATTRIB-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; ATTRIB-NEXT: s_waitcnt vmcnt(0) +; ATTRIB-NEXT: ; return to shader part epilog +; +; FORCE-2-LABEL: sample_3d_nsa4: +; FORCE-2: ; %bb.0: ; %main_body +; FORCE-2-NEXT: s_mov_b32 s12, exec_lo +; FORCE-2-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-2-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-2-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-2-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-2-NEXT: s_waitcnt vmcnt(0) +; FORCE-2-NEXT: ; return to shader part epilog +; +; FORCE-3-LABEL: sample_3d_nsa4: +; FORCE-3: ; %bb.0: ; %main_body +; FORCE-3-NEXT: s_mov_b32 s12, exec_lo +; FORCE-3-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-3-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; FORCE-3-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-3-NEXT: image_sample v[0:3], [v1, v2, v0], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-3-NEXT: s_waitcnt vmcnt(0) +; FORCE-3-NEXT: ; return to shader part epilog +; +; FORCE-4-LABEL: sample_3d_nsa4: +; FORCE-4: ; %bb.0: ; %main_body +; FORCE-4-NEXT: s_mov_b32 s12, exec_lo +; FORCE-4-NEXT: s_wqm_b32 exec_lo, exec_lo +; FORCE-4-NEXT: v_mov_b32_e32 v3, v0 +; FORCE-4-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; FORCE-4-NEXT: image_sample v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D +; FORCE-4-NEXT: s_waitcnt vmcnt(0) +; FORCE-4-NEXT: ; return to shader part epilog +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret 
<4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readonly "amdgpu-nsa-threshold"="2" } +attributes #3 = { nounwind readonly "amdgpu-nsa-threshold"="3" } +attributes #4 = { nounwind readonly "amdgpu-nsa-threshold"="4" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll @@ -1,15 +1,20 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX1010-NSA %s -; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX1030-NSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,GFX11-NSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=32 -verify-machineinstrs < 
%s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s +; Default NSA threshold is 3 addresses ; GCN-LABEL: {{^}}sample_2d: -; -; TODO: use NSA here -; GCN: v_mov_b32_e32 v2, v0 -; -; GCN: image_sample v[0:3], v[1:2], +; NONSA: v_mov_b32_e32 v2, v0 +; NONSA: image_sample v[0:3], v[1:2], +; NSA-T2: image_sample v[0:3], [v1, v0], +; NSA-T3: v_mov_b32_e32 v2, v0 +; NSA-T3: image_sample v[0:3], v[1:2], define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) { main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)