diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -787,6 +787,12 @@
      access is not supported except by flat and scratch instructions in
      GFX9-GFX11.
 
+     Code that manipulates the stack values in other lanes of a wavefront,
+     such as by `addrspacecast`ing stack pointers to generic ones and taking
+     offsets that reach other lanes or by explicitly constructing the scratch
+     buffer descriptor, triggers undefined behavior when it modifies the
+     scratch values of other lanes. The compiler may assume that such
+     modifications do not occur.
+
      **Constant 32-bit**
        *TODO*
 
@@ -806,9 +812,10 @@
        it or not).
 
      **Buffer Resource**
-       The buffer resource is an experimental address space that is currently unsupported
-       in the backend. It exposes a non-integral pointer that will represent a 128-bit
-       buffer descriptor resource.
+       The buffer resource pointer, in address space 8, is the newer form
+       for representing buffer descriptors in AMDGPU IR, replacing their
+       previous representation as `<4 x i32>`. It is a non-integral pointer
+       that represents a 128-bit buffer descriptor resource (`V#`).
 
        Since, in general, a buffer resource supports complex addressing modes that cannot
        be easily represented in LLVM (such as implicit swizzled access to structured
@@ -819,6 +826,14 @@
        Casting a buffer resource to a buffer fat pointer is permitted and adds an offset
        of 0.
 
+       Buffer resources can be created from 64-bit pointers (which should be
+       either generic or global) using the `llvm.amdgcn.make.buffer.rsrc`
+       intrinsic. It takes the pointer, which becomes the base of the
+       resource, the 16-bit stride (and swizzle control) field stored in bits
+       `63:48` of a `V#`, the 32-bit NumRecords/extent field (bits `95:64`),
+       and the 32-bit flags field (bits `127:96`). The specific interpretation
+       of these fields varies by target architecture and is detailed in the
+       ISA descriptions.
+
      **Streamout Registers**
        Dedicated registers used by the GS NGG Streamout Instructions. The register
        file is modelled as a memory in a distinct address space because it is indexed
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -997,6 +997,16 @@
 
 let TargetPrefix = "amdgcn" in {
 
+def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
+  [AMDGPUBufferRsrcTy],
+  [llvm_anyptr_ty, // base
+   llvm_i16_ty,    // stride (and swizzle control)
+   llvm_i32_ty,    // NumRecords / extent
+   llvm_i32_ty],   // flags
+  // Attributes lifted from ptrmask + some extra argument attributes.
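+  // (For illustration, a typical IR-level call, mirroring the tests added
+  // below, is
+  //   %rsrc = call ptr addrspace(8)
+  //       @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678)
+  // which yields a raw buffer over 1234 bytes of %p with the flags word set
+  // to 5678.)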
+  [IntrNoMem, NoCapture<ArgIndex<0>>, ReadNone<ArgIndex<0>>,
+   IntrSpeculatable, IntrWillReturn]>;
+
 defset list<Intrinsic> AMDGPUBufferIntrinsics = {
 
 class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -54,6 +54,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/LLVMContext.h"
@@ -5608,6 +5609,16 @@
   case Intrinsic::strip_invariant_group:
   case Intrinsic::aarch64_irg:
   case Intrinsic::aarch64_tagp:
+  // The amdgcn_make_buffer_rsrc intrinsic does not alter the address of the
+  // input pointer (and thus preserves null-ness for the purposes of escape
+  // analysis, which is where the MustPreserveNullness flag comes into play).
+  // However, it will not necessarily map ptr addrspace(N) null to ptr
+  // addrspace(8) null, aka the "null descriptor", which has "all loads return
+  // 0, all stores are dropped" semantics. Given the context of this intrinsic
+  // list, no one should be relying on such a strict interpretation of
+  // MustPreserveNullness (and, at the time of writing, they are not), but we
+  // document this fact out of an abundance of caution.
+  case Intrinsic::amdgcn_make_buffer_rsrc:
     return true;
   case Intrinsic::ptrmask:
     return !MustPreserveNullness;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -102,6 +102,9 @@
   bool loadInputValue(Register DstReg, MachineIRBuilder &B,
                      AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
 
+  bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                   MachineIRBuilder &B) const;
+
   bool legalizePreloadedArgIntrin(
       MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
       AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
@@ -4423,6 +4424,50 @@
   return true;
 }
 
+/// To create a buffer resource from a 64-bit pointer, mask off the upper 16
+/// bits of the pointer (the stride field, bits 63:48) and replace them with
+/// the stride argument, then merge_values everything together. In the common
+/// case of a raw buffer (stride 0), we can just AND off the top 16 bits.
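+///
+/// For example (illustrative values, not from the original patch): with a
+/// base pointer whose high half is 0x00007fff and a stride of 16, the masked
+/// high half is 0x7fff, and word 1 of the descriptor becomes
+/// (16 << 16) | 0x7fff = 0x00107fff; words 0, 2, and 3 are the pointer's low
+/// half, NumRecords, and the flags, unchanged.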
+bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+  Register Result = MI.getOperand(0).getReg();
+  Register Pointer = MI.getOperand(2).getReg();
+  Register Stride = MI.getOperand(3).getReg();
+  Register NumRecords = MI.getOperand(4).getReg();
+  Register Flags = MI.getOperand(5).getReg();
+
+  LLT S32 = LLT::scalar(32);
+
+  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+  auto Unmerge = B.buildUnmerge(S32, Pointer);
+  Register LowHalf = Unmerge.getReg(0);
+  Register HighHalf = Unmerge.getReg(1);
+
+  auto AndMask = B.buildConstant(S32, 0x0000ffff);
+  auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+
+  MachineInstrBuilder NewHighHalf = Masked;
+  std::optional<ValueAndVReg> StrideConst =
+      getIConstantVRegValWithLookThrough(Stride, MRI);
+  if (!StrideConst || !StrideConst->Value.isZero()) {
+    MachineInstrBuilder ShiftedStride;
+    if (StrideConst) {
+      uint32_t StrideVal = StrideConst->Value.getZExtValue();
+      uint32_t ShiftedStrideVal = StrideVal << 16;
+      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+    } else {
+      auto ExtStride = B.buildAnyExt(S32, Stride);
+      auto ShiftConst = B.buildConstant(S32, 16);
+      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+    }
+    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+  }
+  Register NewHighHalfReg = NewHighHalf.getReg(0);
+  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
@@ -5959,6 +6004,8 @@
     return false;
   }
+  case Intrinsic::amdgcn_make_buffer_rsrc:
+    return legalizePointerAsRsrcIntrin(MI, MRI, B);
   case Intrinsic::amdgcn_kernarg_segment_ptr:
     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
       // This only makes sense to call in a kernel, so just lower to null.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -259,6 +259,10 @@
   // argument (as would be seen in older buffer intrinsics), does nothing.
   SDValue bufferRsrcPtrToVector(SDValue MaybePointer, SelectionDAG &DAG) const;
 
+  // Wrap a 64-bit pointer into a v4i32 (which is how all SelectionDAG code
+  // represents ptr addrspace(8)) using the stride, NumRecords, and flags
+  // specified in the intrinsic.
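+  // (The result is built as a v4i32 and bitcast to i128 when replacing the
+  // node's results; see the BITCAST at the end of the implementation.)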
+  SDValue lowerPointerAsRsrcIntrin(SDNode *Op, SelectionDAG &DAG) const;
+
   // Handle 8 bit and 16 bit buffer loads
   SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
                                      ArrayRef<SDValue> Ops, MemSDNode *M) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,8 +15,10 @@
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -5122,6 +5124,9 @@
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
     switch (IID) {
+    case Intrinsic::amdgcn_make_buffer_rsrc:
+      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
+      return;
     case Intrinsic::amdgcn_cvt_pkrtz: {
       SDValue Src0 = N->getOperand(1);
       SDValue Src1 = N->getOperand(2);
@@ -8667,7 +8672,7 @@
                                                 Align Alignment) const {
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   SDLoc DL(CombinedOffset);
-  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
+  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
     uint32_t SOffset, ImmOffset;
     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
@@ -8706,6 +8711,44 @@
   return Rsrc;
 }
 
+// Encode a global or flat pointer into a buffer resource using the stride,
+// NumRecords, and flags specified in the intrinsic.
+SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc Loc(Op);
+
+  SDValue Pointer = Op->getOperand(1);
+  SDValue Stride = Op->getOperand(2);
+  SDValue NumRecords = Op->getOperand(3);
+  SDValue Flags = Op->getOperand(4);
+
+  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
+  SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
+  SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
+  std::optional<uint32_t> ConstStride = std::nullopt;
+  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
+    ConstStride = ConstNode->getZExtValue();
+
+  SDValue NewHighHalf = Masked;
+  if (!ConstStride || *ConstStride != 0) {
+    SDValue ShiftedStride;
+    if (ConstStride) {
+      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
+    } else {
+      SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
+      ShiftedStride =
+          DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
+                      DAG.getShiftAmountConstant(16, MVT::i32, Loc));
+    }
+    NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
+  }
+
+  SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
+                             NewHighHalf, NumRecords, Flags);
+  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
+  return RsrcPtr;
+}
+
 // Handle 8 bit and 16 bit buffer loads
 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                      EVT LoadVT, SDLoc DL,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck %s
+
+define amdgpu_ps
ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { + ; CHECK-LABEL: name: basic_raw_buffer + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) { + ; CHECK-LABEL: name: read_raw_buffer + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + ret float %loaded +} + +define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { + ; CHECK-LABEL: name: basic_struct_buffer + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: 
[[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_2]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_3]], implicit-def $scc + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY2]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: variable_top_half + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case + ; CHECK: bb.1 (%ir-block.0): + ; 
CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case_load + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_1]], implicit-def $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[S_LSHL_B32_]], implicit-def $scc + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_OR_B32_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +; None of the components are uniform due to the lack of an inreg +define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i32 %numVals, i32 %flags) { + ; CHECK-LABEL: name: general_case_load_with_waterfall + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY6]], [[COPY2]], implicit $exec + ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY1]], [[COPY5]], [[V_LSHLREV_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[V_AND_OR_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY3]], implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub0_sub1 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE1]].sub2_sub3 + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY10]], [[COPY8]], implicit $exec + ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY11]], [[COPY9]], implicit $exec + ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def $scc + ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY7]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck %s + +define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { + ; CHECK-LABEL: name: basic_raw_buffer + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr1 = COPY [[S_AND_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) { + ; CHECK-LABEL: name: read_raw_buffer + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_1]], 
%subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) + ret float %loaded +} + +define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { + ; CHECK-LABEL: name: basic_struct_buffer + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 5678 + ; CHECK-NEXT: $sgpr0 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]] + ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: variable_top_half + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY2]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: $sgpr0 = COPY [[COPY3]] + ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: 
[[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc + ; CHECK-NEXT: $sgpr0 = COPY [[COPY4]] + ; CHECK-NEXT: $sgpr1 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] + ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ret ptr addrspace(8) %rsrc +} + +define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { + ; CHECK-LABEL: name: general_case_load + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc + ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[S_OR_B32_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +; None of the components are uniform due to the lack of an inreg +define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i32 %numVals, i32 %flags) { + ; CHECK-LABEL: name: general_case_load_with_waterfall + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY3]], killed 
[[S_MOV_B32_]], killed [[V_LSHLREV_B32_e64_]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) + ret float %value +} + +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) +declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll @@ -0,0 +1,8 @@ +; RUN: not --crash llc -march=amdgcn -mcpu=gfx900 < %s +; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx900 < %s + +define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr addrspace(3) inreg %p) { + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p3(ptr addrspace(3) %p, i16 0, i32 1234, i32 5678) + ret ptr addrspace(8) %rsrc +} +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p3(ptr addrspace(3) nocapture readnone, i16, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -47,6 +47,66 @@ ret void } +define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { +; SDAG-LABEL: buffers_from_flat_dont_alias: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-NEXT: s_mov_b32 s7, 0 +; SDAG-NEXT: s_mov_b32 s6, 16 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_and_b32 s5, s1, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: s_and_b32 s5, s3, 0xffff +; SDAG-NEXT: s_mov_b32 s4, s2 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 +; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 +; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 +; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: buffers_from_flat_dont_alias: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_mov_b32 s7, 0 +; GISEL-NEXT: s_mov_b32 s6, 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_and_b32 s5, s1, 0xffff +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: s_and_b32 s5, s3, 0xffff +; GISEL-NEXT: s_mov_b32 
s4, s2 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 +; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 +; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 +; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: s_endpgm + %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) + %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) + + %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) + %s0 = fmul float %l0, %l0 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s0, ptr addrspace(8) %b, i32 0, i32 0, i32 0) + + %l1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 4, i32 0, i32 0) + %s1 = fmul float %l1, %l1 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s1, ptr addrspace(8) %b, i32 4, i32 0, i32 0) + + %l2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 8, i32 0, i32 0) + %s2 = fmul float %l2, %l2 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s2, ptr addrspace(8) %b, i32 8, i32 0, i32 0) + + %l3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 12, i32 0, i32 0) + %s3 = fmul float %l3, %l3 + call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %s3, ptr addrspace(8) %b, i32 12, i32 0, i32 0) + + ret void +} + define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) { ; SDAG-LABEL: buffers_might_alias: ; SDAG: ; %bb.0: @@ -151,3 +211,4 @@ declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr readnone nocapture, i16, i32, i32) diff --git a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll --- a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll +++ b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll @@ -161,6 +161,45 @@ ret void } +define void @hoistable_buffer_construction_intrinsic(ptr addrspace(1) noalias %p.global, ptr addrspace(1) noalias %q.global, i32 %bound) { +; CHECK-LABEL: define void @hoistable_buffer_construction_intrinsic +; CHECK-SAME: (ptr addrspace(1) noalias [[P_GLOBAL:%.*]], ptr addrspace(1) noalias [[Q_GLOBAL:%.*]], i32 [[BOUND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P_GLOBAL]], i16 0, i32 0, i32 0) +; CHECK-NEXT: [[Q:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[Q_GLOBAL]], i16 0, i32 0, i32 0) +; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0) +; CHECK-NEXT: [[INC:%.*]] = add i32 [[HOISTABLE]], [[ORIG]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0) +; CHECK-NEXT: [[NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[NEXT]], [[BOUND]] +; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[TAIL:%.*]] +; CHECK: tail: +; CHECK-NEXT: ret void 
+;
+entry:
+  %p = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p.global, i16 0, i32 0, i32 0)
+  %q = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %q.global, i16 0, i32 0, i32 0)
+  br label %loop
+loop:
+  %i = phi i32 [0, %entry], [%next, %loop]
+
+  %hoistable = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) %q, i32 0, i32 0, i32 0, i32 0)
+  %orig = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %p, i32 %i, i32 0, i32 0)
+  %inc = add i32 %hoistable, %orig
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %inc, ptr addrspace(8) %p, i32 %i, i32 0, i32 0)
+
+  %next = add i32 %i, 1
+  %cond = icmp ult i32 %next, %bound
+  br i1 %cond, label %loop, label %tail
+tail:
+  ret void
+}
+
+
 define void @hoistable_buffer_construction_alias_scope(ptr addrspace(1) %p.global, ptr addrspace(1) %q.global, i32 %bound) {
 ; CHECK-LABEL: define void @hoistable_buffer_construction_alias_scope
 ; CHECK-SAME: (ptr addrspace(1) [[P_GLOBAL:%.*]], ptr addrspace(1) [[Q_GLOBAL:%.*]], i32 [[BOUND:%.*]]) {
@@ -217,6 +256,8 @@
 declare i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) #0
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
 declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #1
-
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) #2
 attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
 attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
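
For reference, a minimal end-to-end usage sketch in LLVM IR (illustrative, not
part of the patch; the function name and constants are placeholders, and
stride 0 with flags 0 assumes a plain raw buffer as in the tests above):

  ; Build a raw buffer resource over 256 bytes of %p, then round-trip a value
  ; through it with the raw ptr buffer load/store intrinsics.
  define amdgpu_kernel void @use_rsrc(ptr %p) {
    %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 256, i32 0)
    %v = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
    %v2 = fadd float %v, 1.0
    call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)
    ret void
  }

  declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr nocapture readnone, i16, i32, i32)
  declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg)
  declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg)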