diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -56,6 +56,7 @@
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
 ModulePass *
 createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
+ModulePass *createAMDGPULowerBufferFatPointersPass();
 FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsPass();
@@ -124,6 +125,18 @@
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
+void initializeAMDGPULowerBufferFatPointersPass(PassRegistry &);
+extern char &AMDGPULowerBufferFatPointersID;
+
+struct AMDGPULowerBufferFatPointersPass
+    : PassInfoMixin<AMDGPULowerBufferFatPointersPass> {
+  AMDGPULowerBufferFatPointersPass(const TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+private:
+  const TargetMachine &TM;
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -0,0 +1,1993 @@
+//===-- AMDGPULowerBufferFatPointers.cpp ---------------------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers operations on buffer fat pointers (addrspace 7) to
+// operations on buffer resources (addrspace 8) and is needed for correct
+// codegen.
+//
+// # Background
+//
+// Address space 7 (the buffer fat pointer) is a 160-bit pointer that consists
+// of a 128-bit buffer descriptor and a 32-bit offset into that descriptor.
+// The buffer resource part needs to be a "raw" buffer resource (it must have
+// a stride of 0 and bounds checks must be in raw buffer mode or disabled).
+//
+// When these requirements are met, a buffer resource can be treated as a
+// typical (though quite wide) pointer that follows typical LLVM pointer
+// semantics. This allows the frontend to reason about such buffers (which are
+// often encountered in the context of SPIR-V kernels).
+//
+// However, because of their non-power-of-2 size, these fat pointers cannot be
+// present during translation to MIR (though this restriction may be lifted
+// during the transition to GlobalISel). Therefore, this pass is needed in
+// order to correctly implement these fat pointers.
+//
+// The resource intrinsics take the resource part (the address space 8 pointer)
+// and the offset part (the 32-bit integer) as separate arguments. In addition,
+// many users of these buffers manipulate the offset while leaving the resource
+// part alone. For these reasons, we typically want to separate the resource
+// and offset parts into separate variables, but combine them together when
+// encountering cases where this is required, such as by inserting these values
+// into aggregates or moving them to memory.
+//
+// Therefore, at a high level, `ptr addrspace(7) %x` becomes `ptr addrspace(8)
+// %x.rsrc` and `i32 %x.off`, which will be combined into `{ptr addrspace(8),
+// i32} %x = {%x.rsrc, %x.off}` if needed. Similarly, `vector<Nxp7>` becomes
+// `{vector<Nxp8>, vector<Nxi32>}` and its component parts.
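+//
+// As a rough, illustrative sketch (the value names and constants here are
+// hypothetical and not taken from this patch's tests), a simple access such as
+// ```
+// %q = getelementptr i32, ptr addrspace(7) %p, i32 4
+// %v = load i32, ptr addrspace(7) %q
+// ```
+// is conceptually lowered into operations on the two parts plus a call to a
+// buffer intrinsic, along the lines of
+// ```
+// %q.off = add i32 %p.off, 16
+// %v = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(
+//     ptr addrspace(8) %p.rsrc, i32 %q.off, i32 0, i32 0)
+// ```
+// The sections below describe how this is done in detail.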
+//
+// # Implementation
+//
+// This pass proceeds in three main phases:
+//
+// ## Rewriting loads and stores of p7
+//
+// The first phase is to rewrite away all loads and stores of
+// `ptr addrspace(7)`, including aggregates containing such pointers, to ones
+// that use `i160`. This is handled by `StoreFatPtrsAsIntsVisitor`, which
+// visits loads, stores, and allocas and, if the loaded or stored type contains
+// `ptr addrspace(7)`, rewrites that type to one where the p7s are replaced by
+// i160s, copying other parts of aggregates as needed. In the case of a store,
+// each pointer is `ptrtoint`d to i160 before storing, and loaded integers are
+// `inttoptr`d back. This same transformation is applied to vectors of
+// pointers.
+//
+// Such a transformation allows the later phases of the pass to not need
+// to handle buffer fat pointers moving to and from memory, where we would
+// have to handle the incompatibility between a `{Nxp8, Nxi32}` representation
+// and `Nxi160` directly. Instead, that transposing action (where the vectors
+// of resources and vectors of offsets are concatenated before being stored to
+// memory) is handled by implementing `inttoptr` and `ptrtoint` only.
+//
+// Atomic operations on `ptr addrspace(7)` values are not supported, as the
+// hardware does not include a 160-bit atomic.
+//
+// ## Type remapping
+//
+// We use a `ValueMapper` to mangle uses of [vectors of] buffer fat pointers
+// to the corresponding struct type, which has a resource part and an offset
+// part.
+//
+// This uses a `BufferFatPtrToStructTypeMap` and a `FatPtrConstMaterializer`
+// to perform this mapping, usually by way of `setType`ing values. Constants
+// are handled here because there isn't a good way to fix them up later.
+//
+// This has the downside of leaving the IR in an invalid state (for example,
+// the instruction `getelementptr {ptr addrspace(8), i32} %p, ...` will exist),
+// but all such invalid states will be resolved by the third phase.
+//
+// Functions that don't take buffer fat pointers are modified in place. Those
+// that do take such pointers have their basic blocks moved to a new function
+// whose arguments and return values use {ptr addrspace(8), i32} instead.
+// This phase also records intrinsics so that they can be remangled or deleted
+// later.
+//
+// ## Splitting pointer structs
+//
+// The meat of this pass consists of defining semantics for operations that
+// produce or consume [vectors of] buffer fat pointers in terms of their
+// resource and offset parts. This is accomplished through the
+// `SplitPtrStructs` visitor.
+//
+// In the first pass through each function that is being lowered, the splitter
+// inserts new instructions to implement the split-structures behavior, which
+// is needed for correctness and performance. It records a list of "split
+// users", instructions that are being replaced by operations on the resource
+// and offset parts.
+//
+// Split users do not necessarily need to produce parts themselves
+// (a `load float, ptr addrspace(7)` does not, for example), but, if they do
+// not generate fat buffer pointers, they must RAUW in their replacement
+// instructions during the initial visit.
+//
+// When these new instructions are created, they use the split parts recorded
+// for their initial arguments in order to generate their replacements,
+// creating a parallel set of instructions that does not refer to the original
+// fat pointer values but instead to their resource and offset components.
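+//
+// For example (an illustrative sketch with hypothetical value names, not code
+// from this patch), a split user that consumes a fat pointer without
+// producing one, such as
+// ```
+// %int = ptrtoint ptr addrspace(7) %p to i160
+// ```
+// is rebuilt from the parts roughly as
+// ```
+// %p.rsrc.int = ptrtoint ptr addrspace(8) %p.rsrc to i160
+// %p.rsrc.hi = shl i160 %p.rsrc.int, 32
+// %p.off.ext = zext i32 %p.off to i160
+// %int = or i160 %p.rsrc.hi, %p.off.ext
+// ```
+// with the original `ptrtoint` RAUW'd by the final `or`.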
+//
+// Instructions, such as `extractvalue`, that produce buffer fat pointers from
+// sources that do not have split parts, have such parts generated using
+// `extractvalue`. This is also the initial handling of PHI nodes, which
+// are then cleaned up.
+//
+// ### Conditionals
+//
+// PHI nodes are initially given resource parts via `extractvalue`. However,
+// this is not an efficient rewrite of such nodes, as, in most cases, the
+// resource part in a conditional or loop remains constant throughout the loop
+// and only the offset varies. Failing to optimize away these constant
+// resources would cause additional registers to be sent around loops and
+// might lead to waterfall loops being generated for buffer operations due to
+// the "non-uniform" resource argument.
+//
+// Therefore, after all instructions have been visited, the pointer splitter
+// post-processes all encountered conditionals. Given a PHI node or select,
+// getPossibleRsrcRoots() collects all values that the resource part of that
+// conditional's inputs could come from, as well as all conditional
+// instructions encountered during the search. If, after filtering out the
+// initial node itself, the set of encountered conditionals is a subset of the
+// potential roots and there is a single potential resource that isn't in the
+// conditional set, that value is the only possible value the resource argument
+// could have throughout the control flow.
+//
+// If that condition is met, then a PHI node can have its resource part changed
+// to the singleton value and then be replaced by a PHI on the offsets.
+// Otherwise, each PHI node is split into two, one for the resource part and
+// one for the offset part, which replace the temporary `extractvalue`
+// instructions that were added during the first pass.
+//
+// Similar logic applies to `select`, where
+// `%z = select i1 %cond, ptr addrspace(7) %x, ptr addrspace(7) %y`
+// can be split into `%z.rsrc = %x.rsrc` and
+// `%z.off = select i1 %cond, i32 %x.off, i32 %y.off`
+// if both `%x` and `%y` have the same resource part, but two `select`
+// operations will be needed if they do not.
+//
+// ### Final processing
+//
+// After conditionals have been cleaned up, the IR for each function is
+// rewritten to remove all the old instructions that have been split up.
+//
+// Any instruction that used to produce a buffer fat pointer (and therefore now
+// produces a resource-and-offset struct after type remapping) is
+// replaced as follows:
+// 1. All debug value annotations are cloned to reflect that the resource part
+//    and offset parts are computed separately and constitute different
+//    fragments of the underlying source language variable.
+// 2. All uses that were themselves split are replaced by a `poison` of the
+//    struct type, as they will themselves be erased soon. This rule, combined
+//    with debug handling, should leave the use lists of split instructions
+//    empty in almost all cases.
+// 3. If a user of the original struct-valued result remains, the structure
+//    needed for the new types to work is constructed out of the newly-defined
+//    parts, and the original instruction is replaced by this structure
+//    before being erased. Instructions requiring this construction include
+//    `ret` and `insertvalue`.
+//
+// # Consequences
+//
+// This pass does not alter the CFG.
+//
+// Alias analysis information will become coarser, as the LLVM alias analyzer
+// cannot handle the buffer intrinsics.
Specifically, while we can determine +// that the following two loads do not alias: +// ``` +// %y = getelementptr i32, ptr addrspace(7) %x, i32 1 +// %a = load i32, ptr addrspace(7) %x +// %b = load i32, ptr addrspace(7) %y +// ``` +// we cannot (except through some code that runs during scheduling) determine +// that the rewritten loads below do not alias. +// ``` +// %y.off = add i32 %x.off, 1 +// %a = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %x.rsrc, i32 +// %x.off, ...) +// %b = call @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) +// %x.rsrc, i32 %y.off, ...) +// ``` +// However, existing alias information is preserved. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/AttributeMask.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +#define DEBUG_TYPE "amdgpu-lower-buffer-fat-pointers" + +using namespace llvm; + +static constexpr unsigned BufferOffsetWidth = 32; + +namespace { +/// Recursively replace instances of ptr addrspace(7) and vector with some other type as defined by the relevant subclass. +class BufferFatPtrTypeLoweringBase : public ValueMapTypeRemapper { + DenseMap Map; + + Type *remapTypeImpl(Type *Ty, SmallPtrSetImpl &Seen); + +protected: + virtual Type *remapScalar(PointerType *PT) = 0; + virtual Type *remapVector(VectorType *VT) = 0; + + const DataLayout &DL; + +public: + BufferFatPtrTypeLoweringBase(const DataLayout &DL) : DL(DL) {} + Type *remapType(Type *SrcTy) override; + void clear() { Map.clear(); } +}; + +/// Remap ptr addrspace(7) to i160 and vector to +/// vector in order to correctly handling loading/storing these values +/// from memory. +class BufferFatPtrToIntTypeMap : public BufferFatPtrTypeLoweringBase { + using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase; + +protected: + Type *remapScalar(PointerType *PT) override { return DL.getIntPtrType(PT); } + Type *remapVector(VectorType *VT) override { return DL.getIntPtrType(VT); } +}; + +/// Remap ptr addrspace(7) to {ptr addrspace(8), i32} (the resource and offset +/// parts of the pointer) so that we can easily rewrite operations on these +/// values that aren't loading them from or storing them to memory. 
+class BufferFatPtrToStructTypeMap : public BufferFatPtrTypeLoweringBase { + using BufferFatPtrTypeLoweringBase::BufferFatPtrTypeLoweringBase; + +protected: + Type *remapScalar(PointerType *PT) override; + Type *remapVector(VectorType *VT) override; +}; +} // namespace + +// This code is adapted from the type remapper in lib/Linker/IRMover.cpp +Type *BufferFatPtrTypeLoweringBase::remapTypeImpl( + Type *Ty, SmallPtrSetImpl &Seen) { + Type **Entry = &Map[Ty]; + if (*Entry) + return *Entry; + if (auto *PT = dyn_cast(Ty)) { + if (PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { + return *Entry = remapScalar(PT); + } + } + if (auto *VT = dyn_cast(Ty)) { + auto *PT = dyn_cast(VT->getElementType()); + if (PT && PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) { + return *Entry = remapVector(VT); + } + return *Entry = Ty; + } + bool IsUniqued = !isa(Ty) || cast(Ty)->isLiteral(); + // Base case for ints, floats, opaque pointers, and so on, which don't + // require recursion. + if (Ty->getNumContainedTypes() == 0 && IsUniqued) + return *Entry = Ty; + if (!IsUniqued) { + // Create a dummy type for recursion purposes. + if (!Seen.insert(cast(Ty)).second) { + StructType *Placeholder = StructType::create(Ty->getContext()); + return *Entry = Placeholder; + } + } + bool Changed = false; + SmallVector ElementTypes; + ElementTypes.reserve(Ty->getNumContainedTypes()); + for (unsigned int I = 0, E = Ty->getNumContainedTypes(); I < E; ++I) { + Type *OldElem = Ty->getContainedType(I); + Type *NewElem = remapTypeImpl(OldElem, Seen); + ElementTypes.push_back(NewElem); + Changed |= (OldElem != NewElem); + } + if (!Changed) { + return *Entry = Ty; + } + if (auto *ArrTy = dyn_cast(Ty)) + return *Entry = ArrayType::get(ElementTypes[0], ArrTy->getNumElements()); + if (auto *FnTy = dyn_cast(Ty)) + return *Entry = FunctionType::get(ElementTypes[0], + ArrayRef(ElementTypes).slice(1), + FnTy->isVarArg()); + if (auto *STy = dyn_cast(Ty)) { + // Genuine opaque types don't have a remapping. 
+ if (STy->isOpaque()) + return *Entry = Ty; + bool IsPacked = STy->isPacked(); + if (IsUniqued) + return *Entry = StructType::get(Ty->getContext(), ElementTypes, IsPacked); + SmallString<16> Name(STy->getName()); + STy->setName(""); + Type **RecursionEntry = &Map[Ty]; + if (*RecursionEntry) { + auto *Placeholder = cast(*RecursionEntry); + Placeholder->setBody(ElementTypes, IsPacked); + Placeholder->setName(Name); + return *Entry = Placeholder; + } + return *Entry = StructType::create(Ty->getContext(), ElementTypes, Name, + IsPacked); + } + llvm_unreachable("Unknown type of type that contains elements"); +} + +Type *BufferFatPtrTypeLoweringBase::remapType(Type *SrcTy) { + SmallPtrSet Visited; + return remapTypeImpl(SrcTy, Visited); +} + +Type *BufferFatPtrToStructTypeMap::remapScalar(PointerType *PT) { + LLVMContext &Ctx = PT->getContext(); + return StructType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), + IntegerType::get(Ctx, BufferOffsetWidth)); +} + +Type *BufferFatPtrToStructTypeMap::remapVector(VectorType *VT) { + ElementCount EC = VT->getElementCount(); + LLVMContext &Ctx = VT->getContext(); + Type *RsrcVec = + VectorType::get(PointerType::get(Ctx, AMDGPUAS::BUFFER_RESOURCE), EC); + Type *OffVec = VectorType::get(IntegerType::get(Ctx, BufferOffsetWidth), EC); + return StructType::get(RsrcVec, OffVec); +} + +static bool isBufferFatPtrOrVector(Type *Ty) { + if (auto *PT = dyn_cast(Ty->getScalarType())) + return PT->getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER; + return false; +} + +// True if the type is {ptr addrspace(8), i32} or a struct containing vectors of +// those types. Used to quickly skip instructions we don't need to process. +static bool isSplitFatPtr(Type *Ty) { + auto *ST = dyn_cast(Ty); + if (!ST) + return false; + if (!ST->isLiteral() || ST->getNumElements() != 2) + return false; + auto *MaybeRsrc = + dyn_cast(ST->getElementType(0)->getScalarType()); + auto *MaybeOff = + dyn_cast(ST->getElementType(1)->getScalarType()); + return MaybeRsrc && MaybeOff && + MaybeRsrc->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE && + MaybeOff->getBitWidth() == BufferOffsetWidth; +} + +// True if the result type or any argument types are buffer fat pointers. +static bool isBufferFatPtrConst(Constant *C) { + Type *T = C->getType(); + return isBufferFatPtrOrVector(T) || + llvm::any_of(C->operands(), [](const Use &U) { + return isBufferFatPtrOrVector(U.get()->getType()); + }); +} + +namespace { +/// Convert [vectors of] buffer fat pointers to integers when they are read from +/// or stored to memory. This ensures that these pointers will have the same +/// memory layout as before they are lowered, even though they will no longer +/// have their previous layout in registers/in the program (they'll be broken +/// down into resource and offset parts). This has the downside of imposing +/// marshalling costs when reading or storing these values, but since placing +/// such pointers into memory is an uncommon operation at best, we feel that +/// this cost is acceptable for better performance in the common case. +class StoreFatPtrsAsIntsVisitor + : public InstVisitor { + BufferFatPtrToIntTypeMap *TypeMap; + + ValueToValueMapTy ConvertedForStore; + + IRBuilder<> IRB; + + // Convert all the buffer fat pointers within the input value to inttegers + // so that it can be stored in memory. 
+ Value *fatPtrsToInts(Value *V, Type *From, Type *To, const Twine &Name); + // Convert all the i160s that need to be buffer fat pointers (as specified) + // by the To type) into those pointers to preserve the semantics of the rest + // of the program. + Value *intsToFatPtrs(Value *V, Type *From, Type *To, const Twine &Name); + +public: + StoreFatPtrsAsIntsVisitor(BufferFatPtrToIntTypeMap *TypeMap, LLVMContext &Ctx) + : TypeMap(TypeMap), IRB(Ctx) {} + bool processFunction(Function &F); + + bool visitInstruction(Instruction &I) { return false; } + bool visitAllocaInst(AllocaInst &I); + bool visitLoadInst(LoadInst &LI); + bool visitStoreInst(StoreInst &SI); + bool visitGetElementPtrInst(GetElementPtrInst &I); +}; +} // namespace + +Value *StoreFatPtrsAsIntsVisitor::fatPtrsToInts(Value *V, Type *From, Type *To, + const Twine &Name) { + if (From == To) + return V; + ValueToValueMapTy::iterator Find = ConvertedForStore.find(V); + if (Find != ConvertedForStore.end()) + return Find->second; + if (isBufferFatPtrOrVector(From)) { + Value *Cast = IRB.CreatePtrToInt(V, To, Name + ".int"); + ConvertedForStore[V] = Cast; + return Cast; + } + if (From->getNumContainedTypes() == 0) + return V; + // Structs, arrays, and other compound types. + Value *Ret = PoisonValue::get(To); + if (auto *AT = dyn_cast(From)) { + Type *FromPart = AT->getArrayElementType(); + Type *ToPart = cast(To)->getElementType(); + for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) { + Value *Field = IRB.CreateExtractValue(V, I); + Value *NewField = + fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(I)); + Ret = IRB.CreateInsertValue(Ret, NewField, I); + } + } else { + for (auto [Idx, FromPart, ToPart] : + enumerate(From->subtypes(), To->subtypes())) { + Value *Field = IRB.CreateExtractValue(V, Idx); + Value *NewField = + fatPtrsToInts(Field, FromPart, ToPart, Name + "." + Twine(Idx)); + Ret = IRB.CreateInsertValue(Ret, NewField, Idx); + } + } + ConvertedForStore[V] = Ret; + return Ret; +} + +Value *StoreFatPtrsAsIntsVisitor::intsToFatPtrs(Value *V, Type *From, Type *To, + const Twine &Name) { + if (From == To) + return V; + if (isBufferFatPtrOrVector(To)) { + Value *Cast = IRB.CreateIntToPtr(V, To, Name + ".ptr"); + return Cast; + } + if (From->getNumContainedTypes() == 0) + return V; + // Structs, arrays, and other compound types. + Value *Ret = PoisonValue::get(To); + if (auto *AT = dyn_cast(From)) { + Type *FromPart = AT->getArrayElementType(); + Type *ToPart = cast(To)->getElementType(); + for (uint64_t I = 0, E = AT->getArrayNumElements(); I < E; ++I) { + Value *Field = IRB.CreateExtractValue(V, I); + Value *NewField = + intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(I)); + Ret = IRB.CreateInsertValue(Ret, NewField, I); + } + } else { + for (auto [Idx, FromPart, ToPart] : + enumerate(From->subtypes(), To->subtypes())) { + Value *Field = IRB.CreateExtractValue(V, Idx); + Value *NewField = + intsToFatPtrs(Field, FromPart, ToPart, Name + "." + Twine(Idx)); + Ret = IRB.CreateInsertValue(Ret, NewField, Idx); + } + } + return Ret; +} + +bool StoreFatPtrsAsIntsVisitor::processFunction(Function &F) { + bool Changed = false; + // The visitors will mutate GEPs and allocas, but will push loads and stores + // to the worklist to avoid invalidation. 
+ for (Instruction &I : make_early_inc_range(instructions(F))) { + Changed |= visit(I); + } + ConvertedForStore.clear(); + return Changed; +} + +bool StoreFatPtrsAsIntsVisitor::visitAllocaInst(AllocaInst &I) { + Type *Ty = I.getAllocatedType(); + Type *NewTy = TypeMap->remapType(Ty); + if (Ty == NewTy) + return false; + I.setAllocatedType(NewTy); + return true; +} + +bool StoreFatPtrsAsIntsVisitor::visitGetElementPtrInst(GetElementPtrInst &I) { + Type *Ty = I.getSourceElementType(); + Type *NewTy = TypeMap->remapType(Ty); + if (Ty == NewTy) + return false; + // We'll be rewriting the type `ptr addrspace(7)` out of existence soon, so + // make sure GEPs don't have different semantics with the new type. + I.setSourceElementType(NewTy); + I.setResultElementType(TypeMap->remapType(I.getResultElementType())); + return true; +} + +bool StoreFatPtrsAsIntsVisitor::visitLoadInst(LoadInst &LI) { + Type *Ty = LI.getType(); + Type *IntTy = TypeMap->remapType(Ty); + if (Ty == IntTy) + return false; + + IRB.SetInsertPoint(&LI); + auto *NLI = cast(LI.clone()); + NLI->mutateType(IntTy); + NLI = IRB.Insert(NLI); + copyMetadataForLoad(*NLI, LI); + NLI->takeName(&LI); + + Value *CastBack = intsToFatPtrs(NLI, IntTy, Ty, NLI->getName()); + LI.replaceAllUsesWith(CastBack); + LI.eraseFromParent(); + return true; +} + +bool StoreFatPtrsAsIntsVisitor::visitStoreInst(StoreInst &SI) { + Value *V = SI.getValueOperand(); + Type *Ty = V->getType(); + Type *IntTy = TypeMap->remapType(Ty); + if (Ty == IntTy) + return false; + + IRB.SetInsertPoint(&SI); + Value *IntV = fatPtrsToInts(V, Ty, IntTy, V->getName()); + for (auto *Dbg : at::getAssignmentMarkers(&SI)) + Dbg->setValue(IntV); + + SI.setOperand(0, IntV); + return true; +} + +/// Return the ptr addrspace(8) and i32 (resource and offset parts) in a lowered +/// buffer fat pointer constant. +static std::pair +splitLoweredFatBufferConst(Constant *C) { + if (auto *AZ = dyn_cast(C)) + return std::make_pair(AZ->getStructElement(0), AZ->getStructElement(1)); + if (auto *SC = dyn_cast(C)) + return std::make_pair(SC->getOperand(0), SC->getOperand(1)); + llvm_unreachable("Conversion should've created a {p8, i32} struct"); +} + +namespace { +/// Handle the remapping of ptr addrspace(7) constants. +class FatPtrConstMaterializer final : public ValueMaterializer { + BufferFatPtrToStructTypeMap *TypeMap; + BufferFatPtrToIntTypeMap *IntTypeMap; + // An internal mapper that is used to recurse into the arguments of constants. + // While the documentation for `ValueMapper` specifies not to use it + // recursively, examination of the logic in mapValue() shows that it can + // safely be used recursively when handling constants, like it does in its own + // logic. + ValueMapper InternalMapper; + + Constant *materializeBufferFatPtrConst(Constant *C); + + const DataLayout &DL; + +public: + // UnderlyingMap is the value map this materializer will be filling. 
+ FatPtrConstMaterializer(BufferFatPtrToStructTypeMap *TypeMap, + ValueToValueMapTy &UnderlyingMap, + BufferFatPtrToIntTypeMap *IntTypeMap, + const DataLayout &DL) + : TypeMap(TypeMap), IntTypeMap(IntTypeMap), + InternalMapper(UnderlyingMap, RF_None, TypeMap, this), DL(DL) {} + virtual ~FatPtrConstMaterializer() = default; + + Value *materialize(Value *V) override; +}; +} // namespace + +Constant *FatPtrConstMaterializer::materializeBufferFatPtrConst(Constant *C) { + Type *SrcTy = C->getType(); + auto *NewTy = dyn_cast(TypeMap->remapType(SrcTy)); + if (C->isNullValue()) + return ConstantAggregateZero::getNullValue(NewTy); + if (isa(C)) + return ConstantStruct::get(NewTy, + {PoisonValue::get(NewTy->getElementType(0)), + PoisonValue::get(NewTy->getElementType(1))}); + if (isa(C)) + return ConstantStruct::get(NewTy, + {UndefValue::get(NewTy->getElementType(0)), + UndefValue::get(NewTy->getElementType(1))}); + + if (isa(C)) + report_fatal_error("Global values containing ptr addrspace(7) (buffer " + "fat pointer) values are not supported"); + + if (auto *VC = dyn_cast(C)) { + if (Constant *S = VC->getSplatValue()) { + Constant *NewS = InternalMapper.mapConstant(*S); + if (!NewS) + return nullptr; + auto [Rsrc, Off] = splitLoweredFatBufferConst(NewS); + auto EC = VC->getType()->getElementCount(); + return ConstantStruct::get(NewTy, {ConstantVector::getSplat(EC, Rsrc), + ConstantVector::getSplat(EC, Off)}); + } + SmallVector Rsrcs; + SmallVector Offs; + for (Value *Op : VC->operand_values()) { + auto *NewOp = dyn_cast_or_null(InternalMapper.mapValue(*Op)); + if (!NewOp) + return nullptr; + auto [Rsrc, Off] = splitLoweredFatBufferConst(NewOp); + Rsrcs.push_back(Rsrc); + Offs.push_back(Off); + } + Constant *RsrcVec = ConstantVector::get(Rsrcs); + Constant *OffVec = ConstantVector::get(Offs); + return ConstantStruct::get(NewTy, {RsrcVec, OffVec}); + } + + // Constant expressions. This code mirrors how we fix up the equivalent + // instructions later. + auto *CE = dyn_cast(C); + if (!CE) + return nullptr; + if (auto *GEPO = dyn_cast(C)) { + Constant *RemappedPtr = + InternalMapper.mapConstant(*cast(GEPO->getPointerOperand())); + auto [Rsrc, Off] = splitLoweredFatBufferConst(RemappedPtr); + Type *OffTy = Off->getType(); + bool InBounds = GEPO->isInBounds(); + + MapVector VariableOffs; + APInt NewConstOffVal = APInt::getZero(BufferOffsetWidth); + if (!GEPO->collectOffset(DL, BufferOffsetWidth, VariableOffs, + NewConstOffVal)) + report_fatal_error( + "Scalable vector or unsized struct in fat pointer GEP"); + Constant *OffAccum = nullptr; + // Accumulate offsets together before adding to the base in order to + // preserve as many of the inbounds properties as possible. 
+ for (auto [Arg, Multiple] : VariableOffs) { + Constant *NewArg = InternalMapper.mapConstant(*cast(Arg)); + NewArg = CE->getIntegerCast(NewArg, OffTy, /*IsSigned=*/true); + if (Multiple.isPowerOf2()) + NewArg = ConstantExpr::getShl( + NewArg, + CE->getIntegerValue(OffTy, + APInt(BufferOffsetWidth, Multiple.logBase2())), + /*hasNUW=*/InBounds, /*HasNSW=*/InBounds); + else + NewArg = + ConstantExpr::getMul(NewArg, CE->getIntegerValue(OffTy, Multiple), + /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); + if (OffAccum) + OffAccum = ConstantExpr::getAdd(OffAccum, NewArg, /*hasNUW=*/InBounds, + /*hasNSW=*/InBounds); + else + OffAccum = NewArg; + } + Constant *NewConstOff = CE->getIntegerValue(OffTy, NewConstOffVal); + if (OffAccum) + OffAccum = ConstantExpr::getAdd(OffAccum, NewConstOff, + /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); + else + OffAccum = NewConstOff; + bool HasNonNegativeOff = false; + if (auto *CI = dyn_cast(OffAccum)) { + HasNonNegativeOff = !CI->isNegative(); + } + Constant *NewOff = ConstantExpr::getAdd( + Off, OffAccum, /*hasNUW=*/InBounds && HasNonNegativeOff, + /*hasNSW=*/false); + return ConstantStruct::get(NewTy, {Rsrc, NewOff}); + } + + if (auto *PI = dyn_cast(CE)) { + Constant *Parts = + InternalMapper.mapConstant(*cast(PI->getPointerOperand())); + auto [Rsrc, Off] = splitLoweredFatBufferConst(Parts); + // Here, we take advantage of the fact that ptrtoint has a built-in + // zero-extension behavior. + unsigned FatPtrWidth = + DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); + Constant *RsrcInt = CE->getPtrToInt(Rsrc, SrcTy); + unsigned Width = SrcTy->getScalarSizeInBits(); + Constant *Shift = + CE->getIntegerValue(SrcTy, APInt(Width, BufferOffsetWidth)); + Constant *OffCast = CE->getIntegerCast(Off, SrcTy, /*IsSigned=*/false); + Constant *RsrcHi = ConstantExpr::getShl( + RsrcInt, Shift, Width >= FatPtrWidth, Width > FatPtrWidth); + // This should be an or, but those got recently removed. 
+ Constant *Result = ConstantExpr::getAdd(RsrcHi, OffCast, true, true); + return Result; + } + + if (CE->getOpcode() == Instruction::IntToPtr) { + auto *Arg = cast(CE->getOperand(0)); + unsigned FatPtrWidth = + DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); + unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE); + auto *WantedTy = Arg->getType()->getWithNewBitWidth(FatPtrWidth); + Arg = CE->getIntegerCast(Arg, WantedTy, /*IsSigned=*/false); + + Constant *Shift = + CE->getIntegerValue(WantedTy, APInt(FatPtrWidth, BufferOffsetWidth)); + Type *RsrcIntType = WantedTy->getWithNewBitWidth(RsrcPtrWidth); + Type *RsrcTy = NewTy->getElementType(0); + Type *OffTy = WantedTy->getWithNewBitWidth(BufferOffsetWidth); + Constant *RsrcInt = CE->getTrunc(CE->getLShr(Arg, Shift), RsrcIntType); + Constant *Rsrc = CE->getIntToPtr(RsrcInt, RsrcTy); + Constant *Off = CE->getIntegerCast(Arg, OffTy, /*isSigned=*/false); + + return ConstantStruct::get(NewTy, {Rsrc, Off}); + } + + if (auto *AC = dyn_cast(CE)) { + unsigned SrcAS = AC->getSrcAddressSpace(); + unsigned DstAS = AC->getDestAddressSpace(); + auto *Arg = cast(AC->getPointerOperand()); + auto *NewArg = InternalMapper.mapConstant(*Arg); + if (!NewArg) + return nullptr; + if (SrcAS == AMDGPUAS::BUFFER_FAT_POINTER && + DstAS == AMDGPUAS::BUFFER_FAT_POINTER) + return NewArg; + if (SrcAS == AMDGPUAS::BUFFER_RESOURCE && + DstAS == AMDGPUAS::BUFFER_FAT_POINTER) { + auto *NullOff = CE->getNullValue(NewTy->getElementType(1)); + return ConstantStruct::get(NewTy, {NewArg, NullOff}); + } + report_fatal_error( + "Unsupported address space cast for a buffer fat pointer"); + } + return nullptr; +} + +Value *FatPtrConstMaterializer::materialize(Value *V) { + Constant *C = dyn_cast(V); + if (!C) + return nullptr; + if (auto *GEPO = dyn_cast(C)) { + // As a special case, adjust GEP constants that have a ptr addrspace(7) in + // their source types here, since the earlier local changes didn't handle + // htis. + Type *SrcTy = GEPO->getSourceElementType(); + Type *NewSrcTy = IntTypeMap->remapType(SrcTy); + if (SrcTy != NewSrcTy) { + SmallVector Ops; + Ops.reserve(GEPO->getNumOperands()); + for (const Use &U : GEPO->operands()) + Ops.push_back(cast(U.get())); + auto *NewGEP = ConstantExpr::getGetElementPtr( + NewSrcTy, Ops[0], ArrayRef(Ops).slice(1), + GEPO->isInBounds(), GEPO->getInRangeIndex()); + LLVM_DEBUG(llvm::dbgs() << "p7-getting GEP: " << *GEPO << " becomes " + << *NewGEP << "\n"); + Value *FurtherMap = materialize(NewGEP); + return FurtherMap ? FurtherMap : NewGEP; + } + } + // Structs and other types that happen to contain fat pointers get remapped + // by the mapValue() logic. + if (!isBufferFatPtrConst(C)) + return nullptr; + return materializeBufferFatPtrConst(C); +} + +using PtrParts = std::pair; +namespace { +union CoherentFlag { + struct { + unsigned Glc : 1; // Global level coherence + unsigned Slc : 1; // System level coherence + unsigned Dlc : 1; // Device level coherence + unsigned Swz : 1; // Swizzled buffer + unsigned : 28; + } Bits; + + unsigned U32All; +}; + +// The visitor returns the resource and offset parts for an instruction if they +// can be computed, or (nullptr, nullptr) for cases that don't have a meaningful +// value mapping. +class SplitPtrStructs : public InstVisitor { + ValueToValueMapTy RsrcParts; + ValueToValueMapTy OffParts; + + // Track instructions that have been rewritten into a user of the component + // parts of their ptr addrspace(7) input. 
Instructions that produced + // ptr addrspace(7) parts should **not** be RAUW'd before being added to this + // set, as that replacement will be handled in a post-visit step. However, + // instructions that yield values that aren't fat pointers (ex. ptrtoint) + // should RAUW themselves with new instructions that use the split parts + // of their arguments during processing. + DenseSet SplitUsers; + + // Nodes that need a second look once we've computed the parts for all other + // instructions to see if, for example, we really need to phi on the resource + // part. + SmallVector Conditionals; + // Temporary instructions produced while lowering conditionals that should be + // killed. + SmallVector ConditionalTemps; + + // Subtarget info, needed for determining what cache control bits to set. + const TargetMachine *TM; + const GCNSubtarget *ST; + + IRBuilder<> IRB; + + // Copy metadata between instructions if applicable. + void copyMetadata(Value *Dest, Value *Src); + + // Get the resource and offset parts of the value V, inserting appropriate + // extractvalue calls if needed. + PtrParts getPtrParts(Value *V); + + // Given an instruction that could produce multiple resource parts (a PHI or + // select), collect the set of possible instructions that could have provided + // its resource parts that it could have (the `Roots`) and the set of + // conditional instructions visited during the search (`Seen`). If, after + // removing the root of the search from `Seen` and `Roots`, `Seen` is a subset + // of `Roots` and `Roots - Seen` contains one element, the resource part of + // that element can replace the resource part of all other elements in `Seen`. + void getPossibleRsrcRoots(Instruction *I, SmallPtrSetImpl &Roots, + SmallPtrSetImpl &Seen); + void processConditionals(); + + // If an instruction hav been split into resource and offset parts, + // delete that instruction. If any of its uses have not themselves been split + // into parts (for example, an insertvalue), construct the structure + // that the type rewrites declared should be produced by the dying instruction + // and use that. + // Also, kill the temporary extractvalue operations produced by the two-stage + // lowering of PHIs and conditionals. + void killAndReplaceSplitInstructions(SmallVectorImpl &Origs); + + void setMemoryInfo(CallInst *Intr, Align A, bool IsVolatile); + void insertPreMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); + void insertPostMemOpFence(AtomicOrdering Order, SyncScope::ID SSID); + Value *handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Type *Ty, + Align Alignment, AtomicOrdering Order, + bool IsVolatile, SyncScope::ID SSID); + +public: + SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM) + : TM(TM), ST(nullptr), IRB(Ctx) {} + + void processFunction(Function &F); + + // The collected set of intrinsic declarations that have had their type + // mangled and that can be deleted as unneeded. 
+ SmallPtrSet IntrinsicDeclsToRemove; + + PtrParts visitInstruction(Instruction &I); + PtrParts visitLoadInst(LoadInst &LI); + PtrParts visitStoreInst(StoreInst &SI); + PtrParts visitAtomicRMWInst(AtomicRMWInst &AI); + PtrParts visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI); + PtrParts visitGetElementPtrInst(GetElementPtrInst &GEP); + + PtrParts visitPtrToIntInst(PtrToIntInst &PI); + PtrParts visitIntToPtrInst(IntToPtrInst &IP); + PtrParts visitAddrSpaceCastInst(AddrSpaceCastInst &I); + PtrParts visitICmpInst(ICmpInst &Cmp); + PtrParts visitFreezeInst(FreezeInst &I); + + PtrParts visitExtractElementInst(ExtractElementInst &I); + PtrParts visitInsertElementInst(InsertElementInst &I); + PtrParts visitShuffleVectorInst(ShuffleVectorInst &I); + + PtrParts visitPHINode(PHINode &PHI); + PtrParts visitSelectInst(SelectInst &SI); + + PtrParts visitIntrinsicInst(IntrinsicInst &II); +}; +} // namespace + +void SplitPtrStructs::copyMetadata(Value *Dest, Value *Src) { + auto *DestI = dyn_cast(Dest); + auto *SrcI = dyn_cast(Src); + + if (!DestI || !SrcI) + return; + + DestI->copyMetadata(*SrcI); +} + +PtrParts SplitPtrStructs::getPtrParts(Value *V) { + assert(isSplitFatPtr(V->getType()) && "it's not meaningful to get the parts " + "of something that wasn't rewritten"); + auto *RsrcEntry = &RsrcParts[V]; + auto *OffEntry = &OffParts[V]; + if (*RsrcEntry && *OffEntry) + return {*RsrcEntry, *OffEntry}; + + if (auto *C = dyn_cast(V)) { + auto [Rsrc, Off] = splitLoweredFatBufferConst(C); + return {*RsrcEntry = Rsrc, *OffEntry = Off}; + } + + IRBuilder<>::InsertPointGuard Guard(IRB); + if (auto *I = dyn_cast(V)) { + LLVM_DEBUG(llvm::dbgs() << "Recursing to split parts of " << *I << "\n"); + auto [Rsrc, Off] = visit(*I); + if (Rsrc && Off) + return {*RsrcEntry = Rsrc, *OffEntry = Off}; + // We'll be creating the new values after the relevant instruction. + IRB.SetInsertPoint(I->getInsertionPointAfterDef()); + IRB.SetCurrentDebugLocation(I->getDebugLoc()); + } else if (auto *A = dyn_cast(V)) { + IRB.SetInsertPointPastAllocas(A->getParent()); + IRB.SetCurrentDebugLocation(DebugLoc()); + } + Value *Rsrc = IRB.CreateExtractValue(V, 0, V->getName() + ".rsrc"); + Value *Off = IRB.CreateExtractValue(V, 1, V->getName() + ".off"); + return {*RsrcEntry = Rsrc, *OffEntry = Off}; +} + +/// Returns the instruction that defines the resource part of the value V. +/// Note that this is not getUnderlyingObject(), since that looks through +/// operations like ptrmask which might modify the resource part. +/// +/// We can limit ourselves to just looking through GEPs followed by looking +/// through addrspacecasts because only those two operations preserve the +/// resource part, and because operations on an `addrspace(8)` (which is the +/// legal input to this addrspacecast) would produce a different resource part. 
+static Value *rsrcPartRoot(Value *V) { + while (auto *GEP = dyn_cast(V)) + V = GEP->getPointerOperand(); + while (auto *ASC = dyn_cast(V)) + V = ASC->getPointerOperand(); + return V; +} + +void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I, + SmallPtrSetImpl &Roots, + SmallPtrSetImpl &Seen) { + if (auto *PHI = dyn_cast(I)) { + if (!Seen.insert(I).second) + return; + for (Value *In : PHI->incoming_values()) { + In = rsrcPartRoot(In); + Roots.insert(In); + if (isa(In)) + getPossibleRsrcRoots(cast(In), Roots, Seen); + } + } else if (auto *SI = dyn_cast(I)) { + if (!Seen.insert(SI).second) + return; + Value *TrueVal = rsrcPartRoot(SI->getTrueValue()); + Value *FalseVal = rsrcPartRoot(SI->getFalseValue()); + Roots.insert(TrueVal); + Roots.insert(FalseVal); + if (isa(TrueVal)) + getPossibleRsrcRoots(cast(TrueVal), Roots, Seen); + if (isa(FalseVal)) + getPossibleRsrcRoots(cast(FalseVal), Roots, Seen); + } else { + llvm_unreachable("getPossibleRsrcParts() only works on phi and select"); + } +} + +void SplitPtrStructs::processConditionals() { + SmallDenseMap FoundRsrcs; + SmallPtrSet Roots; + SmallPtrSet Seen; + for (Instruction *I : Conditionals) { + // These have to exist by now because we've visited these nodes. + Value *Rsrc = RsrcParts[I]; + Value *Off = OffParts[I]; + assert(Rsrc && Off && "must have visited conditionals by now"); + + std::optional MaybeRsrc = std::nullopt; + auto MaybeFoundRsrc = FoundRsrcs.find(I); + if (MaybeFoundRsrc != FoundRsrcs.end()) { + MaybeRsrc = MaybeFoundRsrc->second; + } else { + IRBuilder<>::InsertPointGuard Guard(IRB); + Roots.clear(); + Seen.clear(); + getPossibleRsrcRoots(I, Roots, Seen); + LLVM_DEBUG(llvm::dbgs() << "Processing conditional: " << *I << "\n"); +#ifndef NDEBUG + for (Value *V : Roots) + LLVM_DEBUG(llvm::dbgs() << "Root: " << *V << "\n"); + for (Value *V : Seen) + LLVM_DEBUG(llvm::dbgs() << "Seen: " << *V << "\n"); +#endif + // If we are our own possible root, then we shouldn't block our + // replacement with a valid incoming value. + Roots.erase(I); + // We don't want to block the optimization for conditionals that don't + // refer to themselves but did see themselves during the traversal. + Seen.erase(I); + + if (set_is_subset(Seen, Roots)) { + auto Diff = set_difference(Roots, Seen); + if (Diff.size() == 1) { + Value *RootVal = *Diff.begin(); + // Handle the case where previous loops already looked through + // an addrspacecast. 
+ if (isSplitFatPtr(RootVal->getType())) + MaybeRsrc = std::get<0>(getPtrParts(RootVal)); + else + MaybeRsrc = RootVal; + } + } + } + + if (auto *PHI = dyn_cast(I)) { + Value *NewRsrc; + StructType *PHITy = cast(PHI->getType()); + IRB.SetInsertPoint(PHI->getInsertionPointAfterDef()); + IRB.SetCurrentDebugLocation(PHI->getDebugLoc()); + if (MaybeRsrc) { + NewRsrc = *MaybeRsrc; + } else { + Type *RsrcTy = PHITy->getElementType(0); + auto *RsrcPHI = IRB.CreatePHI(RsrcTy, PHI->getNumIncomingValues()); + RsrcPHI->takeName(Rsrc); + for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { + Value *VRsrc = std::get<0>(getPtrParts(V)); + RsrcPHI->addIncoming(VRsrc, BB); + } + copyMetadata(RsrcPHI, PHI); + NewRsrc = RsrcPHI; + } + + Type *OffTy = PHITy->getElementType(1); + auto *NewOff = IRB.CreatePHI(OffTy, PHI->getNumIncomingValues()); + NewOff->takeName(Off); + for (auto [V, BB] : llvm::zip(PHI->incoming_values(), PHI->blocks())) { + assert(OffParts.count(V) && "An offset part had to be created by now"); + Value *VOff = std::get<1>(getPtrParts(V)); + NewOff->addIncoming(VOff, BB); + } + copyMetadata(NewOff, PHI); + + // Note: We don't eraseFromParent() the temporaries because we don't want + // to put the corrections maps in an inconstent state. That'll be handed + // during the rest of the killing. Also, `ValueToValueMapTy` guarantees + // that references in that map will be updated as well. + ConditionalTemps.push_back(cast(Rsrc)); + ConditionalTemps.push_back(cast(Off)); + Rsrc->replaceAllUsesWith(NewRsrc); + Off->replaceAllUsesWith(NewOff); + + // Save on recomputing the cycle traversals in known-root cases. + if (MaybeRsrc) + for (Value *V : Seen) + FoundRsrcs[cast(V)] = NewRsrc; + } else if (auto *SI = dyn_cast(I)) { + if (MaybeRsrc) { + ConditionalTemps.push_back(cast(Rsrc)); + Rsrc->replaceAllUsesWith(*MaybeRsrc); + for (Value *V : Seen) + FoundRsrcs[cast(V)] = *MaybeRsrc; + } + } else { + llvm_unreachable("Only PHIs and selects go in the conditionals list"); + } + } +} + +void SplitPtrStructs::killAndReplaceSplitInstructions( + SmallVectorImpl &Origs) { + for (Instruction *I : ConditionalTemps) + I->eraseFromParent(); + + for (Instruction *I : Origs) { + if (!SplitUsers.contains(I)) + continue; + + SmallVector Dbgs; + findDbgValues(Dbgs, I); + for (auto *Dbg : Dbgs) { + IRB.SetInsertPoint(Dbg); + auto &DL = I->getModule()->getDataLayout(); + assert(isSplitFatPtr(I->getType()) && + "We should've RAUW'd away loads, stores, etc. 
at this point"); + auto *OffDbg = cast(Dbg->clone()); + copyMetadata(OffDbg, Dbg); + auto [Rsrc, Off] = getPtrParts(I); + + int64_t RsrcSz = DL.getTypeSizeInBits(Rsrc->getType()); + int64_t OffSz = DL.getTypeSizeInBits(Off->getType()); + + std::optional RsrcExpr = + DIExpression::createFragmentExpression(Dbg->getExpression(), 0, + RsrcSz); + std::optional OffExpr = + DIExpression::createFragmentExpression(Dbg->getExpression(), RsrcSz, + OffSz); + if (OffExpr) { + OffDbg->setExpression(*OffExpr); + OffDbg->replaceVariableLocationOp(I, Off); + IRB.Insert(OffDbg); + } else { + delete OffDbg; + } + if (RsrcExpr) { + Dbg->setExpression(*RsrcExpr); + Dbg->replaceVariableLocationOp(I, Rsrc); + } else { + Dbg->replaceVariableLocationOp(I, UndefValue::get(I->getType())); + } + } + + Value *Poison = PoisonValue::get(I->getType()); + I->replaceUsesWithIf(Poison, [&](const Use &U) -> bool { + if (const auto *UI = dyn_cast(U.getUser())) + return SplitUsers.contains(UI); + return false; + }); + + if (I->use_empty()) { + I->eraseFromParent(); + continue; + } + IRB.SetInsertPoint(I->getInsertionPointAfterDef()); + IRB.SetCurrentDebugLocation(I->getDebugLoc()); + auto [Rsrc, Off] = getPtrParts(I); + Value *Struct = PoisonValue::get(I->getType()); + Struct = IRB.CreateInsertValue(Struct, Rsrc, 0); + Struct = IRB.CreateInsertValue(Struct, Off, 1); + copyMetadata(Struct, I); + Struct->takeName(I); + I->replaceAllUsesWith(Struct); + I->eraseFromParent(); + } +} + +void SplitPtrStructs::setMemoryInfo(CallInst *Intr, Align A, bool IsVolatile) { + LLVMContext &Ctx = Intr->getContext(); + SmallVector AlignData = { + ConstantAsMetadata::get(IRB.getInt64(A.value()))}; + Intr->setMetadata("amdgpu.align", MDNode::get(Ctx, AlignData)); + if (IsVolatile) + Intr->setMetadata("amdgpu.volatile", MDNode::get(Ctx, {})); +} + +void SplitPtrStructs::insertPreMemOpFence(AtomicOrdering Order, + SyncScope::ID SSID) { + switch (Order) { + case AtomicOrdering::Release: + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: + IRB.CreateFence(AtomicOrdering::Release, SSID); + break; + default: + break; + } +} + +void SplitPtrStructs::insertPostMemOpFence(AtomicOrdering Order, + SyncScope::ID SSID) { + switch (Order) { + case AtomicOrdering::Acquire: + case AtomicOrdering::AcquireRelease: + case AtomicOrdering::SequentiallyConsistent: + IRB.CreateFence(AtomicOrdering::Acquire, SSID); + break; + default: + break; + } +} + +Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, + Type *Ty, Align Alignment, + AtomicOrdering Order, bool IsVolatile, + SyncScope::ID SSID) { + IRB.SetInsertPoint(I); + + auto [Rsrc, Off] = getPtrParts(Ptr); + SmallVector Args; + if (Arg) + Args.push_back(Arg); + Args.push_back(Rsrc); + Args.push_back(Off); + insertPreMemOpFence(Order, SSID); + // soffset is always 0 for these cases, where we always want any offset to be + // part of bounds checking and we don't know which parts of the GEPs is + // uniform. + Args.push_back(IRB.getInt32(0)); + + CoherentFlag Aux; + Aux.U32All = 0; + bool IsInvariant = + (isa(I) && I->getMetadata(LLVMContext::MD_invariant_load)); + bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal); + // Atomic loads and stores need glc, atomic read-modify-write doesn't. 
+ bool IsOneWayAtomic = + !isa(I) && Order != AtomicOrdering::NotAtomic; + Aux.Bits.Glc = IsOneWayAtomic; + if (!IsInvariant) + Aux.Bits.Slc = IsNonTemporal; + if (isa(I) && ST->getGeneration() == AMDGPUSubtarget::GFX10) + Aux.Bits.Dlc = Aux.Bits.Glc; + Args.push_back(IRB.getInt32(Aux.U32All)); + + Intrinsic::ID IID = Intrinsic::not_intrinsic; + if (isa(I)) + // TODO: Do we need to do something about atomic loads? + IID = Intrinsic::amdgcn_raw_ptr_buffer_load; + else if (isa(I)) + IID = Intrinsic::amdgcn_raw_ptr_buffer_store; + else if (auto *RMW = dyn_cast(I)) { + switch (RMW->getOperation()) { + case AtomicRMWInst::Xchg: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap; + break; + case AtomicRMWInst::Add: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_add; + break; + case AtomicRMWInst::Sub: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub; + break; + case AtomicRMWInst::And: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_and; + break; + case AtomicRMWInst::Or: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_or; + break; + case AtomicRMWInst::Xor: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor; + break; + case AtomicRMWInst::Max: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax; + break; + case AtomicRMWInst::Min: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin; + break; + case AtomicRMWInst::UMax: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax; + break; + case AtomicRMWInst::UMin: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin; + break; + case AtomicRMWInst::FAdd: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd; + break; + case AtomicRMWInst::FMax: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax; + break; + case AtomicRMWInst::FMin: + IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin; + break; + case AtomicRMWInst::FSub: { + report_fatal_error("atomic floating point subtraction not supported for " + "buffer resources and should've been expanded away"); + break; + } + case AtomicRMWInst::Nand: + report_fatal_error("atomic nand not supported for buffer resources and " + "should've been expanded away"); + break; + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: + report_fatal_error("wrapping increment/decrement not supported for " + "buffer resources and should've ben expanded away"); + break; + case AtomicRMWInst::BAD_BINOP: + llvm_unreachable("Not sure how we got a bad binop"); + } + } + + auto *Call = IRB.CreateIntrinsic(IID, Ty, Args); + copyMetadata(Call, I); + setMemoryInfo(Call, Alignment, IsVolatile); + Call->takeName(I); + + insertPostMemOpFence(Order, SSID); + // The "no moving p7 directly" rewrites ensure that this load or store won't + // itself need to be split into parts. 
+ SplitUsers.insert(I); + I->replaceAllUsesWith(Call); + return Call; +} + +PtrParts SplitPtrStructs::visitInstruction(Instruction &I) { + return {nullptr, nullptr}; +} + +PtrParts SplitPtrStructs::visitLoadInst(LoadInst &LI) { + if (!isSplitFatPtr(LI.getPointerOperandType())) + return {nullptr, nullptr}; + handleMemoryInst(&LI, nullptr, LI.getPointerOperand(), LI.getType(), + LI.getAlign(), LI.getOrdering(), LI.isVolatile(), + LI.getSyncScopeID()); + return {nullptr, nullptr}; +} + +PtrParts SplitPtrStructs::visitStoreInst(StoreInst &SI) { + if (!isSplitFatPtr(SI.getPointerOperandType())) + return {nullptr, nullptr}; + Value *Arg = SI.getValueOperand(); + handleMemoryInst(&SI, Arg, SI.getPointerOperand(), Arg->getType(), + SI.getAlign(), SI.getOrdering(), SI.isVolatile(), + SI.getSyncScopeID()); + return {nullptr, nullptr}; +} + +PtrParts SplitPtrStructs::visitAtomicRMWInst(AtomicRMWInst &AI) { + if (!isSplitFatPtr(AI.getPointerOperand()->getType())) + return {nullptr, nullptr}; + Value *Arg = AI.getValOperand(); + handleMemoryInst(&AI, Arg, AI.getPointerOperand(), Arg->getType(), + AI.getAlign(), AI.getOrdering(), AI.isVolatile(), + AI.getSyncScopeID()); + return {nullptr, nullptr}; +} + +// Unlike load, store, and RMW, cmpxchg needs special handling to account +// for the boolean argument. +PtrParts SplitPtrStructs::visitAtomicCmpXchgInst(AtomicCmpXchgInst &AI) { + Value *Ptr = AI.getPointerOperand(); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&AI); + + Type *Ty = AI.getNewValOperand()->getType(); + AtomicOrdering Order = AI.getMergedOrdering(); + SyncScope::ID SSID = AI.getSyncScopeID(); + bool IsNonTemporal = AI.getMetadata(LLVMContext::MD_nontemporal); + + auto [Rsrc, Off] = getPtrParts(Ptr); + insertPreMemOpFence(Order, SSID); + + CoherentFlag Aux; + Aux.U32All = 0; + Aux.Bits.Slc = IsNonTemporal; + auto *Call = + IRB.CreateIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap, Ty, + {AI.getNewValOperand(), AI.getCompareOperand(), Rsrc, + Off, IRB.getInt32(0), IRB.getInt32(Aux.U32All)}); + copyMetadata(Call, &AI); + setMemoryInfo(Call, AI.getAlign(), AI.isVolatile()); + Call->takeName(&AI); + insertPostMemOpFence(Order, SSID); + + Value *Res = PoisonValue::get(AI.getType()); + Res = IRB.CreateInsertValue(Res, Call, 0); + if (!AI.isWeak()) { + Value *Succeeded = IRB.CreateICmpEQ(Call, AI.getCompareOperand()); + Res = IRB.CreateInsertValue(Res, Succeeded, 1); + } + SplitUsers.insert(&AI); + AI.replaceAllUsesWith(Res); + return {nullptr, nullptr}; +} + +PtrParts SplitPtrStructs::visitGetElementPtrInst(GetElementPtrInst &GEP) { + Value *Ptr = GEP.getPointerOperand(); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&GEP); + + auto [Rsrc, Off] = getPtrParts(Ptr); + Type *OffTy = Off->getType(); + const DataLayout &DL = GEP.getModule()->getDataLayout(); + bool InBounds = GEP.isInBounds(); + + // In order to call collectOffset() and thus not have to reimplement it, + // we need the GEP's pointer operand to have ptr addrspace(7) type + GEP.setOperand(GEP.getPointerOperandIndex(), + PoisonValue::get(IRB.getPtrTy(AMDGPUAS::BUFFER_FAT_POINTER))); + MapVector VariableOffs; + APInt ConstOffVal = APInt::getZero(BufferOffsetWidth); + if (!GEP.collectOffset(DL, BufferOffsetWidth, VariableOffs, ConstOffVal)) + report_fatal_error("Scalable vector or unsized struct in fat pointer GEP"); + GEP.setOperand(GEP.getPointerOperandIndex(), Ptr); + Value *OffAccum = nullptr; + // Accumulate offsets together before adding 
to the base in order to preserve + // as many of the inbounds properties as possible. + for (auto [Arg, Multiple] : VariableOffs) { + if (auto *OffVecTy = dyn_cast(OffTy)) + if (!Arg->getType()->isVectorTy()) + Arg = IRB.CreateVectorSplat(OffVecTy->getElementCount(), Arg); + Arg = IRB.CreateIntCast(Arg, OffTy, /*isSigned=*/true); + if (Multiple.isPowerOf2()) + Arg = IRB.CreateShl(Arg, BufferOffsetWidth, "", /*hasNUW=*/InBounds, + /*HasNSW=*/InBounds); + else + Arg = IRB.CreateMul(Arg, ConstantExpr::getIntegerValue(OffTy, Multiple), + "", /*hasNUW=*/InBounds, /*hasNSW=*/InBounds); + if (OffAccum) + OffAccum = IRB.CreateAdd(OffAccum, Arg, "", /*hasNUW=*/InBounds, + /*hasNSW=*/InBounds); + else + OffAccum = Arg; + } + Constant *ConstOff = ConstantExpr::getIntegerValue(OffTy, ConstOffVal); + if (OffAccum) + OffAccum = IRB.CreateAdd(OffAccum, ConstOff, "", /*hasNUW=*/InBounds, + /*hasNSW=*/InBounds); + else + OffAccum = ConstOff; + bool HasNonNegativeOff = false; + if (auto *CI = dyn_cast(OffAccum)) { + HasNonNegativeOff = !CI->isNegative(); + } + Value *NewOff = + IRB.CreateAdd(Off, OffAccum, "", /*hasNUW=*/InBounds && HasNonNegativeOff, + /*hasNSW=*/false); + copyMetadata(NewOff, &GEP); + NewOff->takeName(&GEP); + SplitUsers.insert(&GEP); + return {Rsrc, NewOff}; +} + +PtrParts SplitPtrStructs::visitPtrToIntInst(PtrToIntInst &PI) { + Value *Ptr = PI.getPointerOperand(); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&PI); + + Type *ResTy = PI.getType(); + unsigned Width = ResTy->getScalarSizeInBits(); + + auto [Rsrc, Off] = getPtrParts(Ptr); + const DataLayout &DL = PI.getModule()->getDataLayout(); + unsigned FatPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_FAT_POINTER); + + Value *RsrcInt; + if (Width <= BufferOffsetWidth) + RsrcInt = ConstantExpr::getIntegerValue(ResTy, APInt::getZero(Width)); + else + RsrcInt = IRB.CreatePtrToInt(Rsrc, ResTy, PI.getName() + ".rsrc"); + copyMetadata(RsrcInt, &PI); + + Value *Shl = IRB.CreateShl( + RsrcInt, + ConstantExpr::getIntegerValue(ResTy, APInt(Width, BufferOffsetWidth)), "", + Width >= FatPtrWidth, Width > FatPtrWidth); + Value *OffCast = IRB.CreateIntCast(Off, ResTy, /*isSigned=*/false, PI.getName() + ".off"); + Value *Res = IRB.CreateOr(Shl, OffCast); + Res->takeName(&PI); + SplitUsers.insert(&PI); + PI.replaceAllUsesWith(Res); + return {nullptr, nullptr}; +} + +PtrParts SplitPtrStructs::visitIntToPtrInst(IntToPtrInst &IP) { + if (!isSplitFatPtr(IP.getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&IP); + const DataLayout &DL = IP.getModule()->getDataLayout(); + unsigned RsrcPtrWidth = DL.getPointerSizeInBits(AMDGPUAS::BUFFER_RESOURCE); + Value *Int = IP.getOperand(0); + Type *IntTy = Int->getType(); + Type *RsrcIntTy = IntTy->getWithNewBitWidth(RsrcPtrWidth); + unsigned Width = IntTy->getScalarSizeInBits(); + + auto *RetTy = cast(IP.getType()); + Type *RsrcTy = RetTy->getElementType(0); + Type *OffTy = RetTy->getElementType(1); + Value *RsrcPart = IRB.CreateLShr( + Int, + ConstantExpr::getIntegerValue(IntTy, APInt(Width, BufferOffsetWidth))); + Value *RsrcInt = IRB.CreateIntCast(RsrcPart, RsrcIntTy, /*isSigned=*/false); + Value *Rsrc = IRB.CreateIntToPtr(RsrcInt, RsrcTy, IP.getName() + ".rsrc"); + Value *Off = IRB.CreateIntCast(Int, OffTy, /*IsSigned=*/false, IP.getName() + ".off"); + + copyMetadata(Rsrc, &IP); + SplitUsers.insert(&IP); + return {Rsrc, Off}; +} + +PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { + if (!isSplitFatPtr(I.getType())) + return {nullptr, 
PtrParts SplitPtrStructs::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { + if (!isSplitFatPtr(I.getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + Value *In = I.getPointerOperand(); + // No-op casts preserve parts + if (In->getType() == I.getType()) { + auto [Rsrc, Off] = getPtrParts(In); + SplitUsers.insert(&I); + return {Rsrc, Off}; + } + if (I.getSrcAddressSpace() != AMDGPUAS::BUFFER_RESOURCE) + report_fatal_error("Only buffer resources (addrspace 8) can be cast to " + "buffer fat pointers (addrspace 7)"); + Type *OffTy = cast<StructType>(I.getType())->getElementType(1); + Value *ZeroOff = Constant::getNullValue(OffTy); + SplitUsers.insert(&I); + return {In, ZeroOff}; +} + +PtrParts SplitPtrStructs::visitICmpInst(ICmpInst &Cmp) { + Value *Lhs = Cmp.getOperand(0); + if (!isSplitFatPtr(Lhs->getType())) + return {nullptr, nullptr}; + Value *Rhs = Cmp.getOperand(1); + IRB.SetInsertPoint(&Cmp); + ICmpInst::Predicate Pred = Cmp.getPredicate(); + + assert((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) && + "Pointer comparison is only equal or unequal"); + auto [LhsRsrc, LhsOff] = getPtrParts(Lhs); + auto [RhsRsrc, RhsOff] = getPtrParts(Rhs); + Value *RsrcCmp = + IRB.CreateICmp(Pred, LhsRsrc, RhsRsrc, Cmp.getName() + ".rsrc"); + copyMetadata(RsrcCmp, &Cmp); + Value *OffCmp = IRB.CreateICmp(Pred, LhsOff, RhsOff, Cmp.getName() + ".off"); + copyMetadata(OffCmp, &Cmp); + + Value *Res = nullptr; + if (Pred == ICmpInst::ICMP_EQ) + Res = IRB.CreateAnd(RsrcCmp, OffCmp); + else if (Pred == ICmpInst::ICMP_NE) + Res = IRB.CreateOr(RsrcCmp, OffCmp); + copyMetadata(Res, &Cmp); + Res->takeName(&Cmp); + SplitUsers.insert(&Cmp); + Cmp.replaceAllUsesWith(Res); + return {nullptr, nullptr}; +} + +PtrParts SplitPtrStructs::visitFreezeInst(FreezeInst &I) { + if (!isSplitFatPtr(I.getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + auto [Rsrc, Off] = getPtrParts(I.getOperand(0)); + + Value *RsrcRes = IRB.CreateFreeze(Rsrc, I.getName() + ".rsrc"); + copyMetadata(RsrcRes, &I); + Value *OffRes = IRB.CreateFreeze(Off, I.getName() + ".off"); + copyMetadata(OffRes, &I); + SplitUsers.insert(&I); + return {RsrcRes, OffRes}; +} + +PtrParts SplitPtrStructs::visitExtractElementInst(ExtractElementInst &I) { + if (!isSplitFatPtr(I.getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + Value *Vec = I.getVectorOperand(); + Value *Idx = I.getIndexOperand(); + auto [Rsrc, Off] = getPtrParts(Vec); + + Value *RsrcRes = IRB.CreateExtractElement(Rsrc, Idx, I.getName() + ".rsrc"); + copyMetadata(RsrcRes, &I); + Value *OffRes = IRB.CreateExtractElement(Off, Idx, I.getName() + ".off"); + copyMetadata(OffRes, &I); + SplitUsers.insert(&I); + return {RsrcRes, OffRes}; +} + +PtrParts SplitPtrStructs::visitInsertElementInst(InsertElementInst &I) { + // The mutated instructions temporarily don't return vectors, and so + // we need the generic getType() here to avoid crashes. + if (!isSplitFatPtr(cast<Instruction>(I).getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + Value *Vec = I.getOperand(0); + Value *Elem = I.getOperand(1); + Value *Idx = I.getOperand(2); + auto [VecRsrc, VecOff] = getPtrParts(Vec); + auto [ElemRsrc, ElemOff] = getPtrParts(Elem); + + Value *RsrcRes = + IRB.CreateInsertElement(VecRsrc, ElemRsrc, Idx, I.getName() + ".rsrc"); + copyMetadata(RsrcRes, &I); + Value *OffRes = + IRB.CreateInsertElement(VecOff, ElemOff, Idx, I.getName() + ".off"); + copyMetadata(OffRes, &I); + SplitUsers.insert(&I); + return {RsrcRes, OffRes}; +}
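The vector visitors all follow the same pattern: the original operation is duplicated over the resource vector and the offset vector. As a minimal sketch (names assumed), inserting a fat pointer `%p` into a `<2 x ptr addrspace(7)> %v` becomes

%r.rsrc = insertelement <2 x ptr addrspace(8)> %v.rsrc, ptr addrspace(8) %p.rsrc, i32 %idx
%r.off = insertelement <2 x i32> %v.off, i32 %p.off, i32 %idx

and extractelement and shufflevector are handled the same way on both halves.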
PtrParts SplitPtrStructs::visitShuffleVectorInst(ShuffleVectorInst &I) { + // Cast is needed for the same reason as insertelement's. + if (!isSplitFatPtr(cast<Instruction>(I).getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + + Value *V1 = I.getOperand(0); + Value *V2 = I.getOperand(1); + ArrayRef<int> Mask = I.getShuffleMask(); + auto [V1Rsrc, V1Off] = getPtrParts(V1); + auto [V2Rsrc, V2Off] = getPtrParts(V2); + + Value *RsrcRes = + IRB.CreateShuffleVector(V1Rsrc, V2Rsrc, Mask, I.getName() + ".rsrc"); + copyMetadata(RsrcRes, &I); + Value *OffRes = + IRB.CreateShuffleVector(V1Off, V2Off, Mask, I.getName() + ".off"); + copyMetadata(OffRes, &I); + SplitUsers.insert(&I); + return {RsrcRes, OffRes}; +} + +PtrParts SplitPtrStructs::visitPHINode(PHINode &PHI) { + if (!isSplitFatPtr(PHI.getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(PHI.getInsertionPointAfterDef()); + // Phi nodes will be handled in post-processing after we've visited every + // instruction. However, instead of just returning {nullptr, nullptr}, + // we explicitly create the temporary extractvalue operations that are our + // temporary results so that they end up at the beginning of the block with + // the PHIs. + Value *TmpRsrc = IRB.CreateExtractValue(&PHI, 0, PHI.getName() + ".rsrc"); + Value *TmpOff = IRB.CreateExtractValue(&PHI, 1, PHI.getName() + ".off"); + Conditionals.push_back(&PHI); + SplitUsers.insert(&PHI); + return {TmpRsrc, TmpOff}; +} + +PtrParts SplitPtrStructs::visitSelectInst(SelectInst &SI) { + if (!isSplitFatPtr(SI.getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&SI); + + Value *Cond = SI.getCondition(); + Value *True = SI.getTrueValue(); + Value *False = SI.getFalseValue(); + auto [TrueRsrc, TrueOff] = getPtrParts(True); + auto [FalseRsrc, FalseOff] = getPtrParts(False); + + Value *RsrcRes = + IRB.CreateSelect(Cond, TrueRsrc, FalseRsrc, SI.getName() + ".rsrc", &SI); + copyMetadata(RsrcRes, &SI); + Conditionals.push_back(&SI); + Value *OffRes = + IRB.CreateSelect(Cond, TrueOff, FalseOff, SI.getName() + ".off", &SI); + copyMetadata(OffRes, &SI); + SplitUsers.insert(&SI); + return {RsrcRes, OffRes}; +} + +PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { + Intrinsic::ID IID = I.getIntrinsicID(); + switch (IID) { + default: + break; + case Intrinsic::ptrmask: { + Value *Ptr = I.getArgOperand(0); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + Value *Mask = I.getArgOperand(1); + IRB.SetInsertPoint(&I); + const DataLayout &DL = I.getModule()->getDataLayout(); + auto [Rsrc, Off] = getPtrParts(Ptr); + Type *RsrcTy = Rsrc->getType(); + unsigned RsrcPtrWidth = DL.getPointerTypeSizeInBits(RsrcTy); + unsigned Width = Mask->getType()->getScalarSizeInBits(); + + Value *RsrcMask = + IRB.CreateLShr(Mask, IRB.getInt(APInt(Width, BufferOffsetWidth))); + RsrcMask = IRB.CreateIntCast(RsrcMask, IRB.getIntNTy(RsrcPtrWidth), + /*IsSigned=*/false); + Value *OffMask = IRB.CreateIntCast(Mask, IRB.getIntNTy(BufferOffsetWidth), + /*IsSigned=*/false); + Value *RsrcRes = + IRB.CreateIntrinsic(IID, {RsrcTy, IRB.getIntNTy(RsrcPtrWidth)}, + {Rsrc, RsrcMask}, nullptr, I.getName() + ".rsrc"); + copyMetadata(RsrcRes, &I); + Value *OffRes = IRB.CreateAnd(Off, OffMask, I.getName() + ".off"); + copyMetadata(OffRes, &I); + SplitUsers.insert(&I); + IntrinsicDeclsToRemove.insert(I.getCalledFunction()); + return {RsrcRes, OffRes}; + }
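To illustrate the ptrmask case (the value names and the i160 mask width are assumptions based on the p7 layout), a call such as `%m = call ptr addrspace(7) @llvm.ptrmask.p7.i160(ptr addrspace(7) %p, i160 %mask)` is roughly rewritten to

%rsrc.mask.wide = lshr i160 %mask, 32
%rsrc.mask = trunc i160 %rsrc.mask.wide to i128
%m.rsrc = call ptr addrspace(8) @llvm.ptrmask.p8.i128(ptr addrspace(8) %p.rsrc, i128 %rsrc.mask)
%off.mask = trunc i160 %mask to i32
%m.off = and i32 %p.off, %off.mask

so that the high bits of the mask apply to the resource part and the low 32 bits to the offset.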
// Pointer annotation intrinsics that, given their object-wide nature, + // operate on the resource part. + case Intrinsic::invariant_start: { + Value *Ptr = I.getArgOperand(1); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + auto [Rsrc, Off] = getPtrParts(Ptr); + Type *NewTy = PointerType::get(I.getContext(), AMDGPUAS::BUFFER_RESOURCE); + auto *NewRsrc = IRB.CreateIntrinsic(IID, {NewTy}, {I.getOperand(0), Rsrc}); + copyMetadata(NewRsrc, &I); + NewRsrc->takeName(&I); + SplitUsers.insert(&I); + I.replaceAllUsesWith(NewRsrc); + IntrinsicDeclsToRemove.insert(I.getCalledFunction()); + return {nullptr, nullptr}; + } + case Intrinsic::invariant_end: { + Value *RealPtr = I.getArgOperand(2); + if (!isSplitFatPtr(RealPtr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + Value *RealRsrc = getPtrParts(RealPtr).first; + Value *InvPtr = I.getArgOperand(0); + Value *Size = I.getArgOperand(1); + Value *NewRsrc = IRB.CreateIntrinsic(IID, {RealRsrc->getType()}, + {InvPtr, Size, RealRsrc}); + copyMetadata(NewRsrc, &I); + NewRsrc->takeName(&I); + SplitUsers.insert(&I); + I.replaceAllUsesWith(NewRsrc); + IntrinsicDeclsToRemove.insert(I.getCalledFunction()); + return {nullptr, nullptr}; + } + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: { + Value *Ptr = I.getArgOperand(0); + if (!isSplitFatPtr(Ptr->getType())) + return {nullptr, nullptr}; + IRB.SetInsertPoint(&I); + auto [Rsrc, Off] = getPtrParts(Ptr); + Value *NewRsrc = IRB.CreateIntrinsic(IID, {Rsrc->getType()}, {Rsrc}); + copyMetadata(NewRsrc, &I); + NewRsrc->takeName(&I); + SplitUsers.insert(&I); + IntrinsicDeclsToRemove.insert(I.getCalledFunction()); + return {NewRsrc, Off}; + } + } + return {nullptr, nullptr}; +} + +void SplitPtrStructs::processFunction(Function &F) { + ST = &TM->getSubtarget<GCNSubtarget>(F); + SmallVector<Instruction *> Originals; + LLVM_DEBUG(llvm::dbgs() << "Splitting pointer structs in function: " + << F.getName() << "\n"); + for (Instruction &I : instructions(F)) + Originals.push_back(&I); + for (Instruction *I : Originals) { + auto [Rsrc, Off] = visit(I); + assert(((Rsrc && Off) || (!Rsrc && !Off)) && + "Can't have a resource but no offset"); + if (Rsrc) + RsrcParts[I] = Rsrc; + if (Off) + OffParts[I] = Off; + } + processConditionals(); + killAndReplaceSplitInstructions(Originals); + + // Clean up after ourselves to save on memory. + RsrcParts.clear(); + OffParts.clear(); + SplitUsers.clear(); + Conditionals.clear(); + ConditionalTemps.clear(); +} + +namespace { +class AMDGPULowerBufferFatPointers : public ModulePass { +public: + static char ID; + + AMDGPULowerBufferFatPointers() : ModulePass(ID) { + initializeAMDGPULowerBufferFatPointersPass( + *PassRegistry::getPassRegistry()); + } + + bool run(Module &M, const TargetMachine &TM); + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; +} // namespace
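Taken together, the visitors above rewrite whole functions. As a small end-to-end sketch (mirroring the shape of the tests added below; the function name is illustrative), a trivial pointer-increment function goes from

define ptr addrspace(7) @inc(ptr addrspace(7) %arg) {
  %ret = getelementptr inbounds i32, ptr addrspace(7) %arg, i32 1
  ret ptr addrspace(7) %ret
}

to roughly

define { ptr addrspace(8), i32 } @inc({ ptr addrspace(8), i32 } %arg) {
  %arg.rsrc = extractvalue { ptr addrspace(8), i32 } %arg, 0
  %arg.off = extractvalue { ptr addrspace(8), i32 } %arg, 1
  %ret = add nuw i32 %arg.off, 4
  %tmp0 = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) %arg.rsrc, 0
  %tmp1 = insertvalue { ptr addrspace(8), i32 } %tmp0, i32 %ret, 1
  ret { ptr addrspace(8), i32 } %tmp1
}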
/// Returns true if there are values that have a buffer fat pointer in them, +/// which means we'll need to perform rewrites on this function. As a side +/// effect, this will populate the type remapping cache. +static bool containsBufferFatPointers(const Function &F, + BufferFatPtrToStructTypeMap *TypeMap) { + bool HasFatPointers = false; + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + HasFatPointers |= (I.getType() != TypeMap->remapType(I.getType())); + for (const Use &U : I.operands()) + if (auto *C = dyn_cast<Constant>(U.get())) + HasFatPointers |= isBufferFatPtrConst(C); + } + } + return HasFatPointers; +} + +static bool hasFatPointerInterface(const Function &F, + BufferFatPtrToStructTypeMap *TypeMap) { + Type *Ty = F.getFunctionType(); + return Ty != TypeMap->remapType(Ty); +} + +/// Move the body of `OldF` into a new function, returning it. +static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy, + ValueToValueMapTy &CloneMap) { + bool IsIntrinsic = OldF->isIntrinsic(); + Function *NewF = + Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace()); + NewF->copyAttributesFrom(OldF); + NewF->copyMetadata(OldF, 0); + NewF->takeName(OldF); + NewF->recalculateIntrinsicID(); + NewF->setDLLStorageClass(OldF->getDLLStorageClass()); + OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(), NewF); + + while (!OldF->empty()) { + BasicBlock *BB = &OldF->front(); + BB->removeFromParent(); + BB->insertInto(NewF); + CloneMap[BB] = BB; + for (Instruction &I : *BB) { + CloneMap[&I] = &I; + } + } + + AttributeMask PtrOnlyAttrs; + for (auto K : + {Attribute::Dereferenceable, Attribute::DereferenceableOrNull, + Attribute::NoAlias, Attribute::NoCapture, Attribute::NoFree, + Attribute::NonNull, Attribute::NullPointerIsValid, Attribute::ReadNone, + Attribute::ReadOnly, Attribute::WriteOnly}) { + PtrOnlyAttrs.addAttribute(K); + } + SmallVector<AttributeSet> ArgAttrs; + AttributeList OldAttrs = OldF->getAttributes(); + + for (auto [I, OldArg, NewArg] : llvm::enumerate(OldF->args(), NewF->args())) { + CloneMap[&NewArg] = &OldArg; + NewArg.takeName(&OldArg); + Type *OldArgTy = OldArg.getType(), *NewArgTy = NewArg.getType(); + // Temporarily mutate type of `NewArg` to allow RAUW to work. + NewArg.mutateType(OldArgTy); + OldArg.replaceAllUsesWith(&NewArg); + NewArg.mutateType(NewArgTy); + + AttributeSet ArgAttr = OldAttrs.getParamAttrs(I); + // Intrinsics get their attributes fixed later. + if (OldArgTy != NewArgTy && !IsIntrinsic) + ArgAttr = ArgAttr.removeAttributes(NewF->getContext(), PtrOnlyAttrs); + ArgAttrs.push_back(ArgAttr); + } + AttributeSet RetAttrs = OldAttrs.getRetAttrs(); + if (OldF->getReturnType() != NewF->getReturnType() && !IsIntrinsic) + RetAttrs = RetAttrs.removeAttributes(NewF->getContext(), PtrOnlyAttrs); + NewF->setAttributes(AttributeList::get( + NewF->getContext(), OldAttrs.getFnAttrs(), RetAttrs, ArgAttrs)); + return NewF; +} + +static void makeCloneInPraceMap(Function *F, ValueToValueMapTy &CloneMap) { + for (Argument &A : F->args()) + CloneMap[&A] = &A; + for (BasicBlock &BB : *F) { + CloneMap[&BB] = &BB; + for (Instruction &I : BB) + CloneMap[&I] = &I; + } +}
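For functions whose interface mentions fat pointers, including plain declarations, the net effect of moveFunctionAdaptingType is a signature rewrite. For instance (mirroring the calls test below), a declaration such as

declare ptr addrspace(7) @extern(ptr addrspace(7))

is recreated under the same name as

declare { ptr addrspace(8), i32 } @extern({ ptr addrspace(8), i32 })

with the pointer-only attributes on any changed argument or return value dropped, as implemented above.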
bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + // Record the functions which need to be remapped. + // The second element of the pair indicates whether the function has to have + // its arguments or return types adjusted. + SmallVector<std::pair<Function *, bool>> NeedsRemap; + + BufferFatPtrToStructTypeMap StructTM(DL); + BufferFatPtrToIntTypeMap IntTM(DL); + for (const GlobalVariable &GV : M.globals()) { + if (GV.getAddressSpace() == AMDGPUAS::BUFFER_FAT_POINTER) + report_fatal_error("Global variables with a buffer fat pointer address " + "space (7) are not supported"); + Type *VT = GV.getValueType(); + if (VT != StructTM.remapType(VT)) + report_fatal_error("Global variables that contain buffer fat pointers " + "(address space 7 pointers) are unsupported. Use " + "buffer resource pointers (address space 8) instead."); + } + + StoreFatPtrsAsIntsVisitor MemOpsRewrite(&IntTM, M.getContext()); + for (Function &F : M.functions()) { + bool InterfaceChange = hasFatPointerInterface(F, &StructTM); + bool BodyChanges = containsBufferFatPointers(F, &StructTM); + Changed |= MemOpsRewrite.processFunction(F); + if (InterfaceChange || BodyChanges) + NeedsRemap.push_back(std::make_pair(&F, InterfaceChange)); + } + if (NeedsRemap.empty()) + return Changed; + + SmallVector<Function *> NeedsPostProcess; + SmallVector<Function *> Intrinsics; + // Keep one big map so as to memoize constants across functions. + ValueToValueMapTy CloneMap; + FatPtrConstMaterializer Materializer(&StructTM, CloneMap, &IntTM, DL); + + ValueMapper LowerInFuncs(CloneMap, RF_None, &StructTM, &Materializer); + for (auto [F, InterfaceChange] : NeedsRemap) { + Function *NewF = F; + if (InterfaceChange) + NewF = moveFunctionAdaptingType( + F, cast<FunctionType>(StructTM.remapType(F->getFunctionType())), + CloneMap); + else + makeCloneInPraceMap(F, CloneMap); + LowerInFuncs.remapFunction(*NewF); + if (NewF->isIntrinsic()) + Intrinsics.push_back(NewF); + else + NeedsPostProcess.push_back(NewF); + if (InterfaceChange) { + F->replaceAllUsesWith(NewF); + F->eraseFromParent(); + } + Changed = true; + } + StructTM.clear(); + IntTM.clear(); + CloneMap.clear(); + + SplitPtrStructs Splitter(M.getContext(), &TM); + for (Function *F : NeedsPostProcess) + Splitter.processFunction(*F); + for (Function *F : Intrinsics) { + if (Splitter.IntrinsicDeclsToRemove.contains(F)) { + F->eraseFromParent(); + } else { + std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F); + if (NewF) + F->replaceAllUsesWith(*NewF); + } + } + return Changed; +} + +bool AMDGPULowerBufferFatPointers::runOnModule(Module &M) { + TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + return run(M, TM); +} + +char AMDGPULowerBufferFatPointers::ID = 0; + +char &llvm::AMDGPULowerBufferFatPointersID = AMDGPULowerBufferFatPointers::ID; + +void AMDGPULowerBufferFatPointers::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetPassConfig>(); +} + +#define PASS_DESC "Lower buffer fat pointer operations to buffer resources" +INITIALIZE_PASS_BEGIN(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPULowerBufferFatPointers, DEBUG_TYPE, PASS_DESC, false, + false) +#undef PASS_DESC + +ModulePass *llvm::createAMDGPULowerBufferFatPointersPass() { + return new AMDGPULowerBufferFatPointers(); +} + +PreservedAnalyses +AMDGPULowerBufferFatPointersPass::run(Module &M, ModuleAnalysisManager &MA) { + return AMDGPULowerBufferFatPointers().run(M, TM) ?
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -389,6 +389,7 @@ initializeAMDGPULateCodeGenPreparePass(*PR); initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); initializeAMDGPULowerModuleLDSLegacyPass(*PR); + initializeAMDGPULowerBufferFatPointersPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); @@ -613,6 +614,10 @@ PM.addPass(AMDGPULowerModuleLDSPass(*this)); return true; } + if (PassName == "amdgpu-lower-buffer-fat-pointers") { + PM.addPass(AMDGPULowerBufferFatPointersPass(*this)); + return true; + } if (PassName == "amdgpu-lower-ctor-dtor") { PM.addPass(AMDGPUCtorDtorLoweringPass()); return true; @@ -1073,6 +1078,17 @@ if (isPassEnabled(EnableLoadStoreVectorizer)) addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().getArch() == Triple::amdgcn) { + // This lowering has been placed after codegenprepare to take advantage of + // address mode matching (which is why it isn't put with the LDS lowerings). + // It could be placed anywhere before uniformity annotations (an analysis + // that it changes by splitting up fat pointers into their components) + // but has been put before switch lowering and CFG flattening so that those + // passes can run on the more optimized control flow this pass creates in + // many cases. + addPass(createAMDGPULowerBufferFatPointersPass()); + } + // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -67,6 +67,7 @@ AMDGPULegalizerInfo.cpp AMDGPULibCalls.cpp AMDGPULibFunc.cpp + AMDGPULowerBufferFatPointers.cpp AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14418,7 +14418,8 @@ if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; - if (AMDGPU::isFlatGlobalAddrSpace(AS) && + if ((AMDGPU::isFlatGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) && Subtarget->hasAtomicFaddNoRtnInsts()) { if (Subtarget->hasGFX940Insts()) return AtomicExpansionKind::None; @@ -14430,11 +14431,13 @@ if (HasSystemScope) return AtomicExpansionKind::CmpXChg; - if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) { - // global atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. + if ((AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::BUFFER_FAT_POINTER) && + Ty->isFloatTy()) { + // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // global atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. + // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. 
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) return ReportUnsafeHWInst(AtomicExpansionKind::None); } @@ -14489,7 +14492,8 @@ case AtomicRMWInst::Max: case AtomicRMWInst::UMin: case AtomicRMWInst::UMax: { - if (AMDGPU::isFlatGlobalAddrSpace(AS)) { + if (AMDGPU::isFlatGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) { if (RMW->getType()->isFloatTy() && unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll @@ -1,15 +1,79 @@ -; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator < %s -; REQUIRES: asserts - -; Confirm that no one's gotten vectors of addrspace(7) pointers to go through the -; IR translater incidentally. +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator < %s | FileCheck %s define <2 x ptr addrspace(7)> @no_auto_constfold_gep_vector() { + ; CHECK-LABEL: name: no_auto_constfold_gep_vector + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[C:%[0-9]+]]:_(p8) = G_CONSTANT i128 0 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p8>) = G_BUILD_VECTOR [[C]](p8), [[C]](p8) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 123 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p8>) + ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR1]](<2 x s32>) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; CHECK-NEXT: $vgpr8 = COPY [[UV8]](s32) + ; CHECK-NEXT: $vgpr9 = COPY [[UV9]](s32) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9 %gep = getelementptr i8, <2 x ptr addrspace(7)> zeroinitializer, <2 x i32> ret <2 x ptr addrspace(7)> %gep } define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i64 %idx) { + ; CHECK-LABEL: name: gep_vector_splat + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY 
$vgpr6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p8) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p8) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p8>) = G_BUILD_VECTOR [[MV]](p8), [[MV1]](p8) + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C2]](s32), [[C2]](s32) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<2 x p8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV2]](s64), [[C]](s64) + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 0) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[SHUF]](<2 x s64>) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s32>) = G_SHL [[TRUNC]], [[BUILD_VECTOR2]](<2 x s32>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s32>) = G_ADD [[SHL]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s32>) = G_ADD [[BUILD_VECTOR1]], [[ADD]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x p8>) + ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ADD1]](<2 x s32>) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; CHECK-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; CHECK-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; CHECK-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; CHECK-NEXT: $vgpr8 = COPY [[UV8]](s32) + ; CHECK-NEXT: $vgpr9 = COPY [[UV9]](s32) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9 %gep = getelementptr i8, <2 x ptr addrspace(7)> %ptrs, i64 %idx ret <2 x ptr addrspace(7)> %gep } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll @@ -5,15 +5,14 @@ define ptr addrspace(7) @no_auto_constfold_gep() { ; CHECK-LABEL: name: no_auto_constfold_gep ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: [[C:%[0-9]+]]:_(p7) = G_CONSTANT i160 0 + ; CHECK-NEXT: 
[[C:%[0-9]+]]:_(p8) = G_CONSTANT i128 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 123 - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[C]], [[C1]](s32) - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTR_ADD]](p7) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C]](p8) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: $vgpr2 = COPY [[UV2]](s32) ; CHECK-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; CHECK-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; CHECK-NEXT: $vgpr4 = COPY [[C1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4 %gep = getelementptr i8, ptr addrspace(7) null, i32 123 ret ptr addrspace(7) %gep diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -51,39 +51,43 @@ ; GCN-O0-NEXT: AMDGPU Annotate Kernel Features ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments -; GCN-O0-NEXT: Lazy Value Information Analysis -; GCN-O0-NEXT: Lower SwitchInst's to branches -; GCN-O0-NEXT: Lower invoke and unwind, for unwindless code generators -; GCN-O0-NEXT: Remove unreachable blocks from the CFG -; GCN-O0-NEXT: Post-Dominator Tree Construction -; GCN-O0-NEXT: Dominator Tree Construction -; GCN-O0-NEXT: Cycle Info Analysis -; GCN-O0-NEXT: Uniformity Analysis -; GCN-O0-NEXT: Unify divergent function exit nodes -; GCN-O0-NEXT: Lazy Value Information Analysis -; GCN-O0-NEXT: Lower SwitchInst's to branches -; GCN-O0-NEXT: Dominator Tree Construction -; GCN-O0-NEXT: Natural Loop Information -; GCN-O0-NEXT: Convert irreducible control-flow into natural loops -; GCN-O0-NEXT: Fixup each natural loop to have a single exit block -; GCN-O0-NEXT: Post-Dominator Tree Construction -; GCN-O0-NEXT: Dominance Frontier Construction -; GCN-O0-NEXT: Detect single entry single exit regions -; GCN-O0-NEXT: Region Pass Manager -; GCN-O0-NEXT: Structurize control flow -; GCN-O0-NEXT: Cycle Info Analysis -; GCN-O0-NEXT: Uniformity Analysis -; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O0-NEXT: Function Alias Analysis Results -; GCN-O0-NEXT: Memory SSA -; GCN-O0-NEXT: AMDGPU Annotate Uniform Values -; GCN-O0-NEXT: Natural Loop Information -; GCN-O0-NEXT: SI annotate control flow -; GCN-O0-NEXT: Cycle Info Analysis -; GCN-O0-NEXT: Uniformity Analysis -; GCN-O0-NEXT: AMDGPU Rewrite Undef for PHI -; GCN-O0-NEXT: LCSSA Verifier -; GCN-O0-NEXT: Loop-Closed SSA Form Pass +; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O0-NEXT: FunctionPass Manager +; GCN-O0-NEXT: Lazy Value Information Analysis +; GCN-O0-NEXT: Lower SwitchInst's to branches +; GCN-O0-NEXT: Lower invoke and unwind, for unwindless code generators +; GCN-O0-NEXT: Remove unreachable blocks from the CFG +; GCN-O0-NEXT: Post-Dominator Tree Construction +; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Cycle Info Analysis +; GCN-O0-NEXT: Uniformity Analysis +; GCN-O0-NEXT: Unify divergent function exit nodes +; GCN-O0-NEXT: Lazy Value Information Analysis +; GCN-O0-NEXT: Lower SwitchInst's to branches +; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Natural Loop Information +; GCN-O0-NEXT: Convert irreducible control-flow into natural 
loops +; GCN-O0-NEXT: Fixup each natural loop to have a single exit block +; GCN-O0-NEXT: Post-Dominator Tree Construction +; GCN-O0-NEXT: Dominance Frontier Construction +; GCN-O0-NEXT: Detect single entry single exit regions +; GCN-O0-NEXT: Region Pass Manager +; GCN-O0-NEXT: Structurize control flow +; GCN-O0-NEXT: Cycle Info Analysis +; GCN-O0-NEXT: Uniformity Analysis +; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O0-NEXT: Function Alias Analysis Results +; GCN-O0-NEXT: Memory SSA +; GCN-O0-NEXT: AMDGPU Annotate Uniform Values +; GCN-O0-NEXT: Natural Loop Information +; GCN-O0-NEXT: SI annotate control flow +; GCN-O0-NEXT: Cycle Info Analysis +; GCN-O0-NEXT: Uniformity Analysis +; GCN-O0-NEXT: AMDGPU Rewrite Undef for PHI +; GCN-O0-NEXT: LCSSA Verifier +; GCN-O0-NEXT: Loop-Closed SSA Form Pass +; GCN-O0-NEXT: CallGraph Construction +; GCN-O0-NEXT: Call Graph SCC Pass Manager ; GCN-O0-NEXT: DummyCGSCCPass ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Prepare callbr @@ -233,48 +237,52 @@ ; GCN-O1-NEXT: Dominator Tree Construction ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: CodeGen Prepare -; GCN-O1-NEXT: Lazy Value Information Analysis -; GCN-O1-NEXT: Lower SwitchInst's to branches -; GCN-O1-NEXT: Lower invoke and unwind, for unwindless code generators -; GCN-O1-NEXT: Remove unreachable blocks from the CFG -; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-NEXT: Function Alias Analysis Results -; GCN-O1-NEXT: Flatten the CFG -; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Cycle Info Analysis -; GCN-O1-NEXT: Uniformity Analysis -; GCN-O1-NEXT: AMDGPU IR late optimizations -; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-NEXT: Function Alias Analysis Results -; GCN-O1-NEXT: Natural Loop Information -; GCN-O1-NEXT: Code sinking -; GCN-O1-NEXT: Post-Dominator Tree Construction -; GCN-O1-NEXT: Unify divergent function exit nodes -; GCN-O1-NEXT: Lazy Value Information Analysis -; GCN-O1-NEXT: Lower SwitchInst's to branches -; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Natural Loop Information -; GCN-O1-NEXT: Convert irreducible control-flow into natural loops -; GCN-O1-NEXT: Fixup each natural loop to have a single exit block -; GCN-O1-NEXT: Post-Dominator Tree Construction -; GCN-O1-NEXT: Dominance Frontier Construction -; GCN-O1-NEXT: Detect single entry single exit regions -; GCN-O1-NEXT: Region Pass Manager -; GCN-O1-NEXT: Structurize control flow -; GCN-O1-NEXT: Cycle Info Analysis -; GCN-O1-NEXT: Uniformity Analysis -; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-NEXT: Function Alias Analysis Results -; GCN-O1-NEXT: Memory SSA -; GCN-O1-NEXT: AMDGPU Annotate Uniform Values -; GCN-O1-NEXT: Natural Loop Information -; GCN-O1-NEXT: SI annotate control flow -; GCN-O1-NEXT: Cycle Info Analysis -; GCN-O1-NEXT: Uniformity Analysis -; GCN-O1-NEXT: AMDGPU Rewrite Undef for PHI -; GCN-O1-NEXT: LCSSA Verifier -; GCN-O1-NEXT: Loop-Closed SSA Form Pass +; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O1-NEXT: FunctionPass Manager +; GCN-O1-NEXT: Lazy Value Information Analysis +; GCN-O1-NEXT: Lower SwitchInst's to branches +; GCN-O1-NEXT: Lower invoke and unwind, for unwindless code generators +; GCN-O1-NEXT: Remove unreachable blocks from the CFG +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-NEXT: Function Alias Analysis Results +; GCN-O1-NEXT: Flatten 
the CFG +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis +; GCN-O1-NEXT: AMDGPU IR late optimizations +; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-NEXT: Function Alias Analysis Results +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: Code sinking +; GCN-O1-NEXT: Post-Dominator Tree Construction +; GCN-O1-NEXT: Unify divergent function exit nodes +; GCN-O1-NEXT: Lazy Value Information Analysis +; GCN-O1-NEXT: Lower SwitchInst's to branches +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: Convert irreducible control-flow into natural loops +; GCN-O1-NEXT: Fixup each natural loop to have a single exit block +; GCN-O1-NEXT: Post-Dominator Tree Construction +; GCN-O1-NEXT: Dominance Frontier Construction +; GCN-O1-NEXT: Detect single entry single exit regions +; GCN-O1-NEXT: Region Pass Manager +; GCN-O1-NEXT: Structurize control flow +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis +; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-NEXT: Function Alias Analysis Results +; GCN-O1-NEXT: Memory SSA +; GCN-O1-NEXT: AMDGPU Annotate Uniform Values +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: SI annotate control flow +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis +; GCN-O1-NEXT: AMDGPU Rewrite Undef for PHI +; GCN-O1-NEXT: LCSSA Verifier +; GCN-O1-NEXT: Loop-Closed SSA Form Pass +; GCN-O1-NEXT: CallGraph Construction +; GCN-O1-NEXT: Call Graph SCC Pass Manager ; GCN-O1-NEXT: DummyCGSCCPass ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Prepare callbr @@ -519,48 +527,52 @@ ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis ; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer -; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis -; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches -; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators -; GCN-O1-OPTS-NEXT: Remove unreachable blocks from the CFG -; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-OPTS-NEXT: Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Flatten the CFG -; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Cycle Info Analysis -; GCN-O1-OPTS-NEXT: Uniformity Analysis -; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations -; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-OPTS-NEXT: Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: Code sinking -; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes -; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis -; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches -; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: Convert irreducible control-flow into natural loops -; GCN-O1-OPTS-NEXT: Fixup each natural loop to have a single exit block -; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Dominance Frontier Construction -; GCN-O1-OPTS-NEXT: Detect single entry single exit regions -; GCN-O1-OPTS-NEXT: Region Pass Manager -; GCN-O1-OPTS-NEXT: Structurize control flow -; GCN-O1-OPTS-NEXT: Cycle Info Analysis -; GCN-O1-OPTS-NEXT: Uniformity Analysis -; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-OPTS-NEXT: 
Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Memory SSA -; GCN-O1-OPTS-NEXT: AMDGPU Annotate Uniform Values -; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: SI annotate control flow -; GCN-O1-OPTS-NEXT: Cycle Info Analysis -; GCN-O1-OPTS-NEXT: Uniformity Analysis -; GCN-O1-OPTS-NEXT: AMDGPU Rewrite Undef for PHI -; GCN-O1-OPTS-NEXT: LCSSA Verifier -; GCN-O1-OPTS-NEXT: Loop-Closed SSA Form Pass +; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O1-OPTS-NEXT: FunctionPass Manager +; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis +; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches +; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators +; GCN-O1-OPTS-NEXT: Remove unreachable blocks from the CFG +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-OPTS-NEXT: Function Alias Analysis Results +; GCN-O1-OPTS-NEXT: Flatten the CFG +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis +; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations +; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-OPTS-NEXT: Function Alias Analysis Results +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Code sinking +; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes +; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis +; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Convert irreducible control-flow into natural loops +; GCN-O1-OPTS-NEXT: Fixup each natural loop to have a single exit block +; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Dominance Frontier Construction +; GCN-O1-OPTS-NEXT: Detect single entry single exit regions +; GCN-O1-OPTS-NEXT: Region Pass Manager +; GCN-O1-OPTS-NEXT: Structurize control flow +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis +; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-OPTS-NEXT: Function Alias Analysis Results +; GCN-O1-OPTS-NEXT: Memory SSA +; GCN-O1-OPTS-NEXT: AMDGPU Annotate Uniform Values +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: SI annotate control flow +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis +; GCN-O1-OPTS-NEXT: AMDGPU Rewrite Undef for PHI +; GCN-O1-OPTS-NEXT: LCSSA Verifier +; GCN-O1-OPTS-NEXT: Loop-Closed SSA Form Pass +; GCN-O1-OPTS-NEXT: CallGraph Construction +; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager ; GCN-O1-OPTS-NEXT: DummyCGSCCPass ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Prepare callbr @@ -821,48 +833,52 @@ ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: GPU Load and Store Vectorizer -; GCN-O2-NEXT: Lazy Value Information Analysis -; GCN-O2-NEXT: Lower SwitchInst's to branches -; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators -; GCN-O2-NEXT: Remove unreachable blocks from the CFG -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Flatten the CFG -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Cycle Info Analysis -; GCN-O2-NEXT: Uniformity Analysis -; GCN-O2-NEXT: AMDGPU 
IR late optimizations -; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: Code sinking -; GCN-O2-NEXT: Post-Dominator Tree Construction -; GCN-O2-NEXT: Unify divergent function exit nodes -; GCN-O2-NEXT: Lazy Value Information Analysis -; GCN-O2-NEXT: Lower SwitchInst's to branches -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: Convert irreducible control-flow into natural loops -; GCN-O2-NEXT: Fixup each natural loop to have a single exit block -; GCN-O2-NEXT: Post-Dominator Tree Construction -; GCN-O2-NEXT: Dominance Frontier Construction -; GCN-O2-NEXT: Detect single entry single exit regions -; GCN-O2-NEXT: Region Pass Manager -; GCN-O2-NEXT: Structurize control flow -; GCN-O2-NEXT: Cycle Info Analysis -; GCN-O2-NEXT: Uniformity Analysis -; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Memory SSA -; GCN-O2-NEXT: AMDGPU Annotate Uniform Values -; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: SI annotate control flow -; GCN-O2-NEXT: Cycle Info Analysis -; GCN-O2-NEXT: Uniformity Analysis -; GCN-O2-NEXT: AMDGPU Rewrite Undef for PHI -; GCN-O2-NEXT: LCSSA Verifier -; GCN-O2-NEXT: Loop-Closed SSA Form Pass +; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O2-NEXT: FunctionPass Manager +; GCN-O2-NEXT: Lazy Value Information Analysis +; GCN-O2-NEXT: Lower SwitchInst's to branches +; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators +; GCN-O2-NEXT: Remove unreachable blocks from the CFG +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Flatten the CFG +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis +; GCN-O2-NEXT: AMDGPU IR late optimizations +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Code sinking +; GCN-O2-NEXT: Post-Dominator Tree Construction +; GCN-O2-NEXT: Unify divergent function exit nodes +; GCN-O2-NEXT: Lazy Value Information Analysis +; GCN-O2-NEXT: Lower SwitchInst's to branches +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Convert irreducible control-flow into natural loops +; GCN-O2-NEXT: Fixup each natural loop to have a single exit block +; GCN-O2-NEXT: Post-Dominator Tree Construction +; GCN-O2-NEXT: Dominance Frontier Construction +; GCN-O2-NEXT: Detect single entry single exit regions +; GCN-O2-NEXT: Region Pass Manager +; GCN-O2-NEXT: Structurize control flow +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Memory SSA +; GCN-O2-NEXT: AMDGPU Annotate Uniform Values +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: SI annotate control flow +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis +; GCN-O2-NEXT: AMDGPU Rewrite Undef for PHI +; GCN-O2-NEXT: LCSSA Verifier +; GCN-O2-NEXT: Loop-Closed SSA Form Pass +; GCN-O2-NEXT: CallGraph Construction +; GCN-O2-NEXT: Call Graph SCC Pass Manager ; GCN-O2-NEXT: Analysis if a function is memory bound ; GCN-O2-NEXT: DummyCGSCCPass ; 
GCN-O2-NEXT: FunctionPass Manager @@ -1137,48 +1153,52 @@ ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: GPU Load and Store Vectorizer -; GCN-O3-NEXT: Lazy Value Information Analysis -; GCN-O3-NEXT: Lower SwitchInst's to branches -; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators -; GCN-O3-NEXT: Remove unreachable blocks from the CFG -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Flatten the CFG -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Cycle Info Analysis -; GCN-O3-NEXT: Uniformity Analysis -; GCN-O3-NEXT: AMDGPU IR late optimizations -; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: Code sinking -; GCN-O3-NEXT: Post-Dominator Tree Construction -; GCN-O3-NEXT: Unify divergent function exit nodes -; GCN-O3-NEXT: Lazy Value Information Analysis -; GCN-O3-NEXT: Lower SwitchInst's to branches -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: Convert irreducible control-flow into natural loops -; GCN-O3-NEXT: Fixup each natural loop to have a single exit block -; GCN-O3-NEXT: Post-Dominator Tree Construction -; GCN-O3-NEXT: Dominance Frontier Construction -; GCN-O3-NEXT: Detect single entry single exit regions -; GCN-O3-NEXT: Region Pass Manager -; GCN-O3-NEXT: Structurize control flow -; GCN-O3-NEXT: Cycle Info Analysis -; GCN-O3-NEXT: Uniformity Analysis -; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Memory SSA -; GCN-O3-NEXT: AMDGPU Annotate Uniform Values -; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: SI annotate control flow -; GCN-O3-NEXT: Cycle Info Analysis -; GCN-O3-NEXT: Uniformity Analysis -; GCN-O3-NEXT: AMDGPU Rewrite Undef for PHI -; GCN-O3-NEXT: LCSSA Verifier -; GCN-O3-NEXT: Loop-Closed SSA Form Pass +; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources +; GCN-O3-NEXT: FunctionPass Manager +; GCN-O3-NEXT: Lazy Value Information Analysis +; GCN-O3-NEXT: Lower SwitchInst's to branches +; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators +; GCN-O3-NEXT: Remove unreachable blocks from the CFG +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Flatten the CFG +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis +; GCN-O3-NEXT: AMDGPU IR late optimizations +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Code sinking +; GCN-O3-NEXT: Post-Dominator Tree Construction +; GCN-O3-NEXT: Unify divergent function exit nodes +; GCN-O3-NEXT: Lazy Value Information Analysis +; GCN-O3-NEXT: Lower SwitchInst's to branches +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Convert irreducible control-flow into natural loops +; GCN-O3-NEXT: Fixup each natural loop to have a single exit block +; GCN-O3-NEXT: Post-Dominator Tree Construction +; GCN-O3-NEXT: Dominance Frontier Construction +; GCN-O3-NEXT: Detect single entry single exit regions +; GCN-O3-NEXT: Region Pass Manager +; 
GCN-O3-NEXT: Structurize control flow +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Memory SSA +; GCN-O3-NEXT: AMDGPU Annotate Uniform Values +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: SI annotate control flow +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis +; GCN-O3-NEXT: AMDGPU Rewrite Undef for PHI +; GCN-O3-NEXT: LCSSA Verifier +; GCN-O3-NEXT: Loop-Closed SSA Form Pass +; GCN-O3-NEXT: CallGraph Construction +; GCN-O3-NEXT: Call Graph SCC Pass Manager ; GCN-O3-NEXT: Analysis if a function is memory bound ; GCN-O3-NEXT: DummyCGSCCPass ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +define ptr addrspace(7) @recur.inner.1(ptr addrspace(7) %x, i32 %v) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @recur.inner.1 +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[X:%.*]], i32 [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 0 +; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 1 +; CHECK-NEXT: [[ISBASE:%.*]] = icmp sgt i32 [[V]], 0 +; CHECK-NEXT: br i1 [[ISBASE]], label [[RECUR:%.*]], label [[ELSE:%.*]] +; CHECK: recur: +; CHECK-NEXT: [[DEC:%.*]] = sub i32 [[V]], 1 +; CHECK-NEXT: [[INC:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.2(i32 [[DEC]], { ptr addrspace(8), i32 } [[X]]) +; CHECK-NEXT: [[INC_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[INC]], 0 +; CHECK-NEXT: [[INC_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[INC]], 1 +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: else: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET_RSRC:%.*]] = phi ptr addrspace(8) [ [[INC_RSRC]], [[RECUR]] ], [ [[X_RSRC]], [[ELSE]] ] +; CHECK-NEXT: [[RET_OFF:%.*]] = phi i32 [ [[INC_OFF]], [[RECUR]] ], [ [[X_OFF]], [[ELSE]] ] +; CHECK-NEXT: [[TMP0:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP0]], i32 [[RET_OFF]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; +bb: + %isBase = icmp sgt i32 %v, 0 + br i1 %isBase, label %recur, label %else +recur: + %dec = sub i32 %v, 1 + %inc = call ptr addrspace(7) @recur.inner.2(i32 %dec, ptr addrspace(7) %x) + br label %end +else: + br label %end +end: + %ret = phi ptr addrspace(7) [%inc, %recur], [%x, %else] + ret ptr addrspace(7) %ret +} + +define ptr addrspace(7) @recur.inner.2(i32 %v, ptr addrspace(7) %x) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @recur.inner.2 +; CHECK-SAME: (i32 [[V:%.*]], { ptr addrspace(8), i32 } [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: 
[[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 0 +; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 1 +; CHECK-NEXT: [[INC:%.*]] = add i32 [[X_OFF]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_RSRC]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[INC]], 1 +; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP2]], i32 [[V]]) +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %inc = getelementptr i32, ptr addrspace(7) %x, i32 1 + %ret = call ptr addrspace(7) @recur.inner.1(ptr addrspace(7) %inc, i32 %v) + ret ptr addrspace(7) %ret +} + +define void @recur.outer(ptr addrspace(7) %x, ptr %arg) { +; CHECK-LABEL: define void @recur.outer +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[X:%.*]], ptr [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[BOUND:%.*]] = load i32, ptr [[ARG]], align 4 +; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[X]], i32 [[BOUND]]) +; CHECK-NEXT: [[RET_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0 +; CHECK-NEXT: [[RET_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1 +; CHECK-NEXT: [[RET_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[RET_RSRC]] to i160 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i160 [[RET_INT_RSRC]], 32 +; CHECK-NEXT: [[RET_INT_OFF:%.*]] = zext i32 [[RET_OFF]] to i160 +; CHECK-NEXT: [[RET_INT:%.*]] = or i160 [[TMP1]], [[RET_INT_OFF]] +; CHECK-NEXT: store i160 [[RET_INT]], ptr [[ARG]], align 32 +; CHECK-NEXT: ret void +; + %bound = load i32, ptr %arg + %ret = call ptr addrspace(7) @recur.inner.1(ptr addrspace(7) %x, i32 %bound) + store ptr addrspace(7) %ret, ptr %arg + ret void +} + +declare ptr addrspace(7) @extern(ptr addrspace(7) %arg) +define void @caller(ptr addrspace(7) noundef nonnull %arg) { +; CHECK-LABEL: define void @caller +; CHECK-SAME: ({ ptr addrspace(8), i32 } noundef [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0 +; CHECK-NEXT: [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1 +; CHECK-NEXT: [[V:%.*]] = call { ptr addrspace(8), i32 } @extern({ ptr addrspace(8), i32 } [[ARG]]) +; CHECK-NEXT: [[V_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[V]], 0 +; CHECK-NEXT: [[V_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[V]], 1 +; CHECK-NEXT: [[V_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[V_RSRC]] to i160 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i160 [[V_INT_RSRC]], 32 +; CHECK-NEXT: [[V_INT_OFF:%.*]] = zext i32 [[V_OFF]] to i160 +; CHECK-NEXT: [[V_INT:%.*]] = or i160 [[TMP1]], [[V_INT_OFF]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[V_INT]], ptr addrspace(8) [[ARG_RSRC]], i32 [[ARG_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: ret void +; + %v = call ptr addrspace(7) @extern(ptr addrspace(7) %arg) + store ptr addrspace(7) %v, ptr addrspace(7) %arg + ret void +} + +define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) noalias noundef nonnull %arg) { +; CHECK-LABEL: define internal noundef { ptr addrspace(8), i32 } @foo +; CHECK-SAME: ({ ptr addrspace(8), i32 } noundef [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0 +; CHECK-NEXT: [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1 +; CHECK-NEXT: [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4 +; CHECK-NEXT: 
[[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; + %ret = getelementptr inbounds i32, ptr addrspace(7) %arg, i32 1 + ret ptr addrspace(7) %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-constants.ll @@ -0,0 +1,220 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +@buf = external addrspace(8) global i8 +@flat = external global i8 + +define ptr addrspace(7) @null() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @null +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } zeroinitializer +; + ret ptr addrspace(7) null +} + +define <2 x ptr addrspace(7)> @null_vector() { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @null_vector +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } zeroinitializer +; + ret <2 x ptr addrspace(7)> zeroinitializer +} + +define ptr addrspace(7) @undef() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @undef +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } undef +; + ret ptr addrspace(7) undef +} + +define <2 x ptr addrspace(7)> @undef_vec() { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @undef_vec +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } undef +; + ret <2 x ptr addrspace(7)> undef +} + +define ptr addrspace(7) @poison() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @poison +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } poison +; + ret ptr addrspace(7) poison +} + +define <2 x ptr addrspace(7)> @poison_vec() { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @poison_vec +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } poison +; + ret <2 x ptr addrspace(7)> poison +} + +define ptr addrspace(7) @cast_global() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @cast_global +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } { ptr addrspace(8) @buf, i32 0 } +; + ret ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)) +} + +define ptr addrspace(7) @cast_null() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @cast_null +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } zeroinitializer +; + ret ptr addrspace(7) addrspacecast (ptr addrspace(8) null to ptr addrspace(7)) +} + +define <2 x ptr addrspace(7)> @cast_vec() { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @cast_vec +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } { <2 x ptr addrspace(8)> , <2 x i32> zeroinitializer } +; + ret <2 x ptr addrspace(7)> addrspacecast ( + <2 x ptr 
addrspace(8)> + to <2 x ptr addrspace(7)>) +} + +define ptr addrspace(7) @gep() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @gep +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } { ptr addrspace(8) @buf, i32 36 } +; + ret ptr addrspace(7) getelementptr inbounds ( + [4 x i32], + ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)), + i64 2, i32 1) +} + +define <2 x ptr addrspace(7)> @gep_vector() { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @gep_vector +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } { <2 x ptr addrspace(8)> , <2 x i32> } +; + ret <2 x ptr addrspace(7)> getelementptr ( + i32, + <2 x ptr addrspace(7)> + , + <2 x i32> ) +} + +define ptr @gep_of_p7() { +; CHECK-LABEL: define ptr @gep_of_p7 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret ptr getelementptr inbounds (ptr addrspace(7), ptr @flat, i64 2) +; + ret ptr getelementptr inbounds (ptr addrspace(7), ptr @flat, i64 2) +} + +define ptr @gep_of_p7_vector() { +; CHECK-LABEL: define ptr @gep_of_p7_vector +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret ptr getelementptr (<2 x ptr addrspace(7)>, ptr @flat, i64 2) +; + ret ptr getelementptr (<2 x ptr addrspace(7)>, ptr @flat, i64 2) +} + +define ptr @gep_of_p7_struct() { +; CHECK-LABEL: define ptr @gep_of_p7_struct +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret ptr getelementptr ({ ptr addrspace(7), i32 }, ptr @flat, i64 2) +; + ret ptr getelementptr ({ptr addrspace(7), i32}, ptr @flat, i64 2) +} + +define ptr addrspace(7) @gep_p7_from_p7() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @gep_p7_from_p7 +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } { ptr addrspace(8) @buf, i32 48 } +; + ret ptr addrspace(7) getelementptr (ptr addrspace(7), + ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)), + i64 2) +} + +define i160 @ptrtoint() { +; CHECK-LABEL: define i160 @ptrtoint +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret i160 add nuw nsw (i160 shl nuw (i160 ptrtoint (ptr addrspace(8) @buf to i160), i160 32), i160 12) +; + ret i160 ptrtoint( + ptr addrspace(7) getelementptr( + i32, ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)), + i32 3) to i160) +} + +define i256 @ptrtoint_long() { +; CHECK-LABEL: define i256 @ptrtoint_long +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret i256 add nuw nsw (i256 shl nuw nsw (i256 ptrtoint (ptr addrspace(8) @buf to i256), i256 32), i256 12) +; + ret i256 ptrtoint( + ptr addrspace(7) getelementptr( + i32, ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)), + i32 3) to i256) +} + +define i64 @ptrtoint_short() { +; CHECK-LABEL: define i64 @ptrtoint_short +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret i64 add nuw nsw (i64 shl (i64 ptrtoint (ptr addrspace(8) @buf to i64), i64 32), i64 12) +; + ret i64 ptrtoint( + ptr addrspace(7) getelementptr( + i32, ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)), + i32 3) to i64) +} + +define i32 @ptrtoint_very_short() { +; CHECK-LABEL: define i32 @ptrtoint_very_short +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret i32 add nuw nsw (i32 shl (i32 ptrtoint (ptr addrspace(8) @buf to i32), i32 32), i32 12) +; + ret i32 ptrtoint( + ptr addrspace(7) getelementptr( + i32, ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)), + i32 3) to i32) +} + + +define <2 x i160> @ptrtoint_vec() { +; CHECK-LABEL: define <2 x i160> @ptrtoint_vec +; CHECK-SAME: () 
#[[ATTR0]] { +; CHECK-NEXT: ret <2 x i160> zeroinitializer +; + ret <2 x i160> ptrtoint (<2 x ptr addrspace(7)> zeroinitializer to <2 x i160>) +} + +define ptr addrspace(7) @inttoptr() { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @inttoptr +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { ptr addrspace(8), i32 } zeroinitializer +; + ret ptr addrspace(7) inttoptr (i160 0 to ptr addrspace(7)) +} + +define <2 x ptr addrspace(7)> @inttoptr_vec() { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @inttoptr_vec +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } { <2 x ptr addrspace(8)> zeroinitializer, <2 x i32> } +; + ret <2 x ptr addrspace(7)> inttoptr (<2 x i160> to <2 x ptr addrspace(7)>) +} + +define i32 @fancy_zero() { +; CHECK-LABEL: define i32 @fancy_zero +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: ret i32 shl (i32 ptrtoint (ptr addrspace(8) @buf to i32), i32 32) +; + ret i32 ptrtoint ( + ptr addrspace(7) addrspacecast (ptr addrspace(8) @buf to ptr addrspace(7)) + to i32) +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-control-flow.ll @@ -0,0 +1,354 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +;; This should optimize to just the offset part +define float @sum(ptr addrspace(8) %buf, i32 %len) { +; CHECK-LABEL: define float @sum +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 [[PTR_PREV_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]] +; CHECK-NEXT: [[PTR]] = add i32 [[PTR_PREV_OFF]], 4 +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]] +; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SUM]] +; +entry: + %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + br label %loop +loop: + %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ] + %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ] + %i = phi i32 [ %i.next, %loop ], [ 0, %entry ] + + %val = load float, ptr addrspace(7) %ptr.prev + %sum = fadd float %sum.prev, %val + + %ptr = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1 + %i.next = add i32 %i, 1 + %test = icmp ult i32 %i.next, %len + br i1 %test, label %loop, label %exit +exit: + ret float %sum +} + +;; But this should not +define float @sum_integer_ops(ptr addrspace(8) 
%buf, i32 %len) { +; CHECK-LABEL: define float @sum_integer_ops +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_RSRC:%.*]] = phi ptr addrspace(8) [ [[PTR_RSRC:%.*]], [[LOOP]] ], [ [[BUF]], [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[PTR_PREV_RSRC]], i32 [[PTR_PREV_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]] +; CHECK-NEXT: [[PTR_PREV_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[PTR_PREV_RSRC]] to i160 +; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i160 [[PTR_PREV_INT_RSRC]], 32 +; CHECK-NEXT: [[PTR_PREV_INT_OFF:%.*]] = zext i32 [[PTR_PREV_OFF]] to i160 +; CHECK-NEXT: [[PTR_PREV_INT:%.*]] = or i160 [[TMP0]], [[PTR_PREV_INT_OFF]] +; CHECK-NEXT: [[PTR_INT:%.*]] = add i160 [[PTR_PREV_INT]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[PTR_INT]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128 +; CHECK-NEXT: [[PTR_RSRC]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) +; CHECK-NEXT: [[PTR_OFF]] = trunc i160 [[PTR_INT]] to i32 +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]] +; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SUM]] +; +entry: + %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + br label %loop +loop: + %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ] + %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ] + %i = phi i32 [ %i.next, %loop ], [ 0, %entry ] + + %val = load float, ptr addrspace(7) %ptr.prev + %sum = fadd float %sum.prev, %val + + %ptr.prev.int = ptrtoint ptr addrspace(7) %ptr.prev to i160 + %ptr.int = add i160 %ptr.prev.int, 4 + %ptr = inttoptr i160 %ptr.int to ptr addrspace(7) + %i.next = add i32 %i, 1 + %test = icmp ult i32 %i.next, %len + br i1 %test, label %loop, label %exit +exit: + ret float %sum +} + +;; Should go to offsets only +define float @sum_2d(ptr addrspace(8) %buf, i32 %ii, i32 %jj) { +; CHECK-LABEL: define float @sum_2d +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[II:%.*]], i32 [[JJ:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP1_ENTRY:%.*]] +; CHECK: loop1.entry: +; CHECK-NEXT: [[SUM1_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP1_EXIT:%.*]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP1_EXIT]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[PTR1_PREV_OFF:%.*]] = phi i32 [ [[PTR1:%.*]], [[LOOP1_EXIT]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP2:%.*]] +; CHECK: loop2: +; CHECK-NEXT: [[SUM2_PREV:%.*]] = phi float [ [[SUM]], [[LOOP2]] ], [ [[SUM1_PREV]], [[LOOP1_ENTRY]] ] +; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[J_NEXT:%.*]], [[LOOP2]] ], [ 0, [[LOOP1_ENTRY]] ] +; CHECK-NEXT: [[PTR2_PREV_OFF:%.*]] = phi i32 [ [[PTR2:%.*]], [[LOOP2]] ], [ [[PTR1_PREV_OFF]], [[LOOP1_ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 [[PTR2_PREV_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[SUM]] = fadd float [[SUM2_PREV]], [[VAL]] +; CHECK-NEXT: [[PTR2]] = 
add i32 [[PTR2_PREV_OFF]], 4 +; CHECK-NEXT: [[J_NEXT]] = add i32 [[J]], 1 +; CHECK-NEXT: [[TEST2:%.*]] = icmp ult i32 [[J_NEXT]], [[JJ]] +; CHECK-NEXT: br i1 [[TEST2]], label [[LOOP2]], label [[LOOP1_EXIT]] +; CHECK: loop1.exit: +; CHECK-NEXT: [[PTR1]] = add i32 [[PTR2]], 4 +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[TEST1:%.*]] = icmp ult i32 [[I_NEXT]], [[II]] +; CHECK-NEXT: br i1 [[TEST1]], label [[LOOP1_ENTRY]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SUM]] +; +entry: + %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + br label %loop1.entry +loop1.entry: + %sum1.prev = phi float [ %sum, %loop1.exit ], [ 0.0, %entry ] + %ptr1.prev = phi ptr addrspace(7) [ %ptr1, %loop1.exit ], [ %start, %entry ] + %i = phi i32 [ %i.next, %loop1.exit ], [ 0, %entry ] + + br label %loop2 +loop2: + %sum2.prev = phi float [ %sum, %loop2 ], [ %sum1.prev, %loop1.entry ] + %ptr2.prev = phi ptr addrspace(7) [ %ptr2, %loop2 ], [ %ptr1.prev, %loop1.entry ] + %j = phi i32 [ %j.next, %loop2 ], [ 0, %loop1.entry ] + + %val = load float, ptr addrspace(7) %ptr2.prev + %sum = fadd float %sum2.prev, %val + + %ptr2 = getelementptr float, ptr addrspace(7) %ptr2.prev, i32 1 + %j.next = add i32 %j, 1 + %test2 = icmp ult i32 %j.next, %jj + + br i1 %test2, label %loop2, label %loop1.exit +loop1.exit: + %ptr1 = getelementptr float, ptr addrspace(7) %ptr2, i32 1 + %i.next = add i32 %i, 1 + %test1 = icmp ult i32 %i.next, %ii + br i1 %test1, label %loop1.entry, label %exit +exit: + ret float %sum +} + +;; This should optimize to just the offset parts since all the arguments to the +;; select point to the same buffer. +define float @sum_jump_on_negative(ptr addrspace(8) %buf, i32 %len) { +; CHECK-LABEL: define float @sum_jump_on_negative +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 [[PTR_PREV_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]] +; CHECK-NEXT: [[SKIP_NEXT:%.*]] = fcmp olt float [[VAL]], 0.000000e+00 +; CHECK-NEXT: [[SMALL_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 4 +; CHECK-NEXT: [[LARGE_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 8 +; CHECK-NEXT: [[PTR_OFF]] = select i1 [[SKIP_NEXT]], i32 [[LARGE_JUMP]], i32 [[SMALL_JUMP]] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]] +; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SUM]] +; +entry: + %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + br label %loop +loop: + %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ] + %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ] + %i = phi i32 [ %i.next, %loop ], [ 0, %entry ] + + %val = load float, ptr addrspace(7) %ptr.prev + %sum = fadd float %sum.prev, %val + + %skip.next = fcmp olt float %val, 0.0 + %small.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1 + %large.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 2 + %ptr = select i1 %skip.next, ptr addrspace(7) %large.jump, 
ptr addrspace(7) %small.jump + + %i.next = add i32 %i, 1 + %test = icmp ult i32 %i.next, %len + br i1 %test, label %loop, label %exit +exit: + ret float %sum +} + +define float @sum_jump_on_negative_with_phi(ptr addrspace(8) %buf, i32 %len) { +; CHECK-LABEL: define float @sum_jump_on_negative_with_phi +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP_EXIT:%.*]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 [[PTR_PREV_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]] +; CHECK-NEXT: [[SKIP_NEXT:%.*]] = fcmp olt float [[VAL]], 0.000000e+00 +; CHECK-NEXT: br i1 [[SKIP_NEXT]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[LARGE_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 8 +; CHECK-NEXT: br label [[LOOP_EXIT]] +; CHECK: else: +; CHECK-NEXT: [[SMALL_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 4 +; CHECK-NEXT: br label [[LOOP_EXIT]] +; CHECK: loop.exit: +; CHECK-NEXT: [[PTR_OFF]] = phi i32 [ [[LARGE_JUMP]], [[THEN]] ], [ [[SMALL_JUMP]], [[ELSE]] ] +; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SUM]] +; +entry: + %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + br label %loop +loop: + %sum.prev = phi float [ %sum, %loop.exit ], [ 0.0, %entry ] + %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop.exit ], [ %start, %entry ] + %i = phi i32 [ %i.next, %loop.exit ], [ 0, %entry ] + + %val = load float, ptr addrspace(7) %ptr.prev + %sum = fadd float %sum.prev, %val + + %i.next = add i32 %i, 1 + %test = icmp ult i32 %i.next, %len + + %skip.next = fcmp olt float %val, 0.0 + br i1 %skip.next, label %then, label %else +then: + %large.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 2 + br label %loop.exit +else: + %small.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1 + br label %loop.exit +loop.exit: + %ptr = phi ptr addrspace(7) [ %large.jump, %then ], [ %small.jump, %else ] + br i1 %test, label %loop, label %exit +exit: + ret float %sum +} + +;; But this has a shifting resource part. 
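+;; Because the select's two operands can refer to different buffers, the pass
+;; cannot reduce the pointer to its offset alone here: as the CHECK lines below
+;; show, the pointer select is split into one select over the resource parts
+;; and one over the offset parts.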
+define float @sum_new_buffer_on_negative(ptr addrspace(8) %buf1, ptr addrspace(8) %buf2, i32 %len) { +; CHECK-LABEL: define float @sum_new_buffer_on_negative +; CHECK-SAME: (ptr addrspace(8) [[BUF1:%.*]], ptr addrspace(8) [[BUF2:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_RSRC:%.*]] = phi ptr addrspace(8) [ [[PTR_RSRC:%.*]], [[LOOP]] ], [ [[BUF1]], [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[PTR_PREV_RSRC]], i32 [[PTR_PREV_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]] +; CHECK-NEXT: [[HOP:%.*]] = fcmp olt float [[VAL]], 0.000000e+00 +; CHECK-NEXT: [[THIS_NEXT:%.*]] = add i32 [[PTR_PREV_OFF]], 4 +; CHECK-NEXT: [[PTR_RSRC]] = select i1 [[HOP]], ptr addrspace(8) [[PTR_PREV_RSRC]], ptr addrspace(8) [[BUF2]] +; CHECK-NEXT: [[PTR_OFF]] = select i1 [[HOP]], i32 [[THIS_NEXT]], i32 0 +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]] +; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SUM]] +; +entry: + %start = addrspacecast ptr addrspace(8) %buf1 to ptr addrspace(7) + %start2 = addrspacecast ptr addrspace(8) %buf2 to ptr addrspace(7) + br label %loop +loop: + %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ] + %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ] + %i = phi i32 [ %i.next, %loop ], [ 0, %entry ] + + %val = load float, ptr addrspace(7) %ptr.prev + %sum = fadd float %sum.prev, %val + + %hop = fcmp olt float %val, 0.0 + %this.next = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1 + %ptr = select i1 %hop, ptr addrspace(7) %this.next, ptr addrspace(7) %start2 + + %i.next = add i32 %i, 1 + %test = icmp ult i32 %i.next, %len + br i1 %test, label %loop, label %exit +exit: + ret float %sum +} + +;; As does this. 
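+;; In this variant the pointer phi in %loop.exit has incoming values from two
+;; different buffers, so it is likewise split into a phi over the resource
+;; parts and a phi over the offset parts (see the CHECK lines below).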
+define float @sum_new_buffer_on_negative_with_phi(ptr addrspace(8) %buf1, ptr addrspace(8) %buf2, i32 %len) { +; CHECK-LABEL: define float @sum_new_buffer_on_negative_with_phi +; CHECK-SAME: (ptr addrspace(8) [[BUF1:%.*]], ptr addrspace(8) [[BUF2:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP_EXIT:%.*]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_RSRC:%.*]] = phi ptr addrspace(8) [ [[PTR_RSRC:%.*]], [[LOOP_EXIT]] ], [ [[BUF1]], [[ENTRY]] ] +; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[PTR_PREV_RSRC]], i32 [[PTR_PREV_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]] +; CHECK-NEXT: [[HOP:%.*]] = fcmp olt float [[VAL]], 0.000000e+00 +; CHECK-NEXT: br i1 [[HOP]], label [[THEN:%.*]], label [[LOOP_EXIT]] +; CHECK: then: +; CHECK-NEXT: [[THIS_NEXT:%.*]] = add i32 [[PTR_PREV_OFF]], 4 +; CHECK-NEXT: br label [[LOOP_EXIT]] +; CHECK: loop.exit: +; CHECK-NEXT: [[PTR_RSRC]] = phi ptr addrspace(8) [ [[PTR_PREV_RSRC]], [[THEN]] ], [ [[BUF2]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_OFF]] = phi i32 [ [[THIS_NEXT]], [[THEN]] ], [ 0, [[LOOP]] ] +; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret float [[SUM]] +; +entry: + %start = addrspacecast ptr addrspace(8) %buf1 to ptr addrspace(7) + %start2 = addrspacecast ptr addrspace(8) %buf2 to ptr addrspace(7) + br label %loop +loop: + %sum.prev = phi float [ %sum, %loop.exit ], [ 0.0, %entry ] + %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop.exit ], [ %start, %entry ] + %i = phi i32 [ %i.next, %loop.exit ], [ 0, %entry ] + + %val = load float, ptr addrspace(7) %ptr.prev + %sum = fadd float %sum.prev, %val + + %i.next = add i32 %i, 1 + %test = icmp ult i32 %i.next, %len + %hop = fcmp olt float %val, 0.0 + br i1 %hop, label %then, label %loop.exit +then: + %this.next = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1 + br label %loop.exit +loop.exit: + %ptr = phi ptr addrspace(7) [ %this.next, %then ], [ %start2, %loop ] + br i1 %test, label %loop, label %exit +exit: + ret float %sum +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +define void @loads(ptr addrspace(8) %buf) { +; CHECK-LABEL: define void @loads +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SCALAR:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr 
addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: [[VEC2:%.*]] = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !1 +; CHECK-NEXT: [[VEC4:%.*]] = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !2 +; CHECK-NEXT: [[NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 2), !nontemporal !3, !amdgpu.align !0 +; CHECK-NEXT: [[INVARIANT:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !invariant.load !4, !amdgpu.align !0 +; CHECK-NEXT: [[NONTEMPORAL_INVARIANT:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !invariant.load !4, !nontemporal !3, !amdgpu.align !0 +; CHECK-NEXT: [[VOLATILE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0, !amdgpu.volatile !4 +; CHECK-NEXT: [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 2), !nontemporal !3, !amdgpu.align !0, !amdgpu.volatile !4 +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 1), !amdgpu.align !0, !amdgpu.volatile !4 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 1), !amdgpu.align !0 +; CHECK-NEXT: [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 1), !amdgpu.align !0 +; CHECK-NEXT: fence acquire +; CHECK-NEXT: ret void +; + %base = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %p = getelementptr float, ptr addrspace(7) %base, i32 4 + + %scalar = load float, ptr addrspace(7) %p, align 4 + %vec2 = load <2 x float>, ptr addrspace(7) %p, align 8 + %vec4 = load <4 x float>, ptr addrspace(7) %p, align 16 + + %nontemporal = load float, ptr addrspace(7) %p, !nontemporal !0 + %invariant = load float, ptr addrspace(7) %p, !invariant.load !1 + %nontemporal.invariant = load float, ptr addrspace(7) %p, !nontemporal !0, !invariant.load !1 + + %volatile = load volatile float, ptr addrspace(7) %p + %volatile.nontemporal = load volatile float, ptr addrspace(7) %p, !nontemporal !0 + + %atomic = load atomic volatile float, ptr addrspace(7) %p syncscope("wavefront") seq_cst, align 4 + %atomic.monotonic = load atomic float, ptr addrspace(7) %p syncscope("wavefront") monotonic, align 4 + %atomic.acquire = load atomic float, ptr addrspace(7) %p acquire, align 4 + + ret void +} + +define void @stores(ptr addrspace(8) %buf, float %f, <4 x float> %f4) { +; CHECK-LABEL: define void @stores +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], float [[F:%.*]], <4 x float> [[F4:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> [[F4]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !2 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 2), !nontemporal !3, !amdgpu.align !0 +; CHECK-NEXT: call 
void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0, !amdgpu.volatile !4 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 2), !nontemporal !3, !amdgpu.align !0, !amdgpu.volatile !4 +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 1), !amdgpu.align !0, !amdgpu.volatile !4 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 1), !amdgpu.align !0 +; CHECK-NEXT: fence release +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 1), !amdgpu.align !0 +; CHECK-NEXT: ret void +; + %base = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %p = getelementptr float, ptr addrspace(7) %base, i32 4 + + store float %f, ptr addrspace(7) %p, align 4 + store <4 x float> %f4, ptr addrspace(7) %p, align 16 + + store float %f, ptr addrspace(7) %p, !nontemporal !0 + + store volatile float %f, ptr addrspace(7) %p + store volatile float %f, ptr addrspace(7) %p, !nontemporal !0 + + store atomic volatile float %f, ptr addrspace(7) %p syncscope("wavefront") seq_cst, align 4 + store atomic float %f, ptr addrspace(7) %p syncscope("wavefront") monotonic, align 4 + store atomic float %f, ptr addrspace(7) %p release, align 4 + + ret void +} + +define void @atomicrmw(ptr addrspace(8) %buf, float %f, i32 %i) { +; CHECK-LABEL: define void @atomicrmw +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], float [[F:%.*]], i32 [[I:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[XCHG:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[ADD:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[SUB:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[AND:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.and.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[OR:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.or.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[XOR:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.xor.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smin.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), 
!amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[MAX:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umin.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umax.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[FADD:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[FMAX:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[FMIN:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f32(float [[F]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 [[I]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: ret void +; + %base = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %p = getelementptr float, ptr addrspace(7) %base, i32 4 + + ; Fence insertion is tested by loads and stores + %xchg = atomicrmw xchg ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %add = atomicrmw add ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %sub = atomicrmw sub ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %and = atomicrmw and ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %or = atomicrmw or ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %xor = atomicrmw xor ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %min = atomicrmw min ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %max = atomicrmw max ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %umin = atomicrmw umin ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + %umax = atomicrmw umax ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + + %fadd = atomicrmw fadd ptr addrspace(7) %p, float %f syncscope("wavefront") seq_cst, align 4 + %fmax = atomicrmw fmax ptr addrspace(7) %p, float %f syncscope("wavefront") seq_cst, align 4 + %fmin = atomicrmw fmin ptr addrspace(7) %p, float %f syncscope("wavefront") seq_cst, align 4 + + ; Check a no-return atomic + atomicrmw add ptr addrspace(7) %p, i32 %i syncscope("wavefront") seq_cst, align 4 + + ret void +} + +define {i32, i1} @cmpxchg(ptr addrspace(8) %buf, i32 %wanted, i32 %new) { +; CHECK-LABEL: 
define { i32, i1 } @cmpxchg +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[WANTED:%.*]], i32 [[NEW:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 [[NEW]], i32 [[WANTED]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0, !amdgpu.volatile !4 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { i32, i1 } poison, i32 [[RET]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[RET]], [[WANTED]] +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i32, i1 } [[TMP1]], i1 [[TMP2]], 1 +; CHECK-NEXT: ret { i32, i1 } [[TMP3]] +; + %base = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %p = getelementptr i32, ptr addrspace(7) %base, i32 4 + + %ret = cmpxchg volatile ptr addrspace(7) %p, i32 %wanted, i32 %new syncscope("wavefront") acq_rel monotonic, align 4 + ret {i32, i1} %ret +} + +define {i32, i1} @cmpxchg_weak(ptr addrspace(8) %buf, i32 %wanted, i32 %new) { +; CHECK-LABEL: define { i32, i1 } @cmpxchg_weak +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[WANTED:%.*]], i32 [[NEW:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: fence syncscope("wavefront") release +; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 [[NEW]], i32 [[WANTED]], ptr addrspace(8) [[BUF]], i32 16, i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: fence syncscope("wavefront") acquire +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { i32, i1 } poison, i32 [[RET]], 0 +; CHECK-NEXT: ret { i32, i1 } [[TMP1]] +; + %base = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + %p = getelementptr i32, ptr addrspace(7) %base, i32 4 + + %ret = cmpxchg weak ptr addrspace(7) %p, i32 %wanted, i32 %new syncscope("wavefront") acq_rel monotonic, align 4 + ret {i32, i1} %ret +} + +!0 = ! { i32 1 } +!1 = ! 
{ } diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll @@ -0,0 +1,154 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +define void @scalar_copy(ptr %a, ptr %b) { +; CHECK-LABEL: define void @scalar_copy +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[X:%.*]] = load i160, ptr [[A]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[X]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128 +; CHECK-NEXT: [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) +; CHECK-NEXT: [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32 +; CHECK-NEXT: [[B1:%.*]] = getelementptr i160, ptr [[B]], i64 1 +; CHECK-NEXT: [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160 +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32 +; CHECK-NEXT: [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160 +; CHECK-NEXT: [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]] +; CHECK-NEXT: store i160 [[X_PTR_INT]], ptr [[B1]], align 32 +; CHECK-NEXT: ret void +; + %x = load ptr addrspace(7), ptr %a + %b1 = getelementptr ptr addrspace(7), ptr %b, i64 1 + store ptr addrspace(7) %x, ptr %b1 + ret void +} + +define void @vector_copy(ptr %a, ptr %b) { +; CHECK-LABEL: define void @vector_copy +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X:%.*]] = load <4 x i160>, ptr [[A]], align 128 +; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i160> [[X]], +; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i160> [[TMP1]] to <4 x i128> +; CHECK-NEXT: [[X_PTR_RSRC:%.*]] = inttoptr <4 x i128> [[TMP2]] to <4 x ptr addrspace(8)> +; CHECK-NEXT: [[X_PTR_OFF:%.*]] = trunc <4 x i160> [[X]] to <4 x i32> +; CHECK-NEXT: [[B1:%.*]] = getelementptr <4 x i160>, ptr [[B]], i64 2 +; CHECK-NEXT: [[X_PTR_INT_RSRC:%.*]] = ptrtoint <4 x ptr addrspace(8)> [[X_PTR_RSRC]] to <4 x i160> +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i160> [[X_PTR_INT_RSRC]], +; CHECK-NEXT: [[X_PTR_INT_OFF:%.*]] = zext <4 x i32> [[X_PTR_OFF]] to <4 x i160> +; CHECK-NEXT: [[X_PTR_INT:%.*]] = or <4 x i160> [[TMP3]], [[X_PTR_INT_OFF]] +; CHECK-NEXT: store <4 x i160> [[X_PTR_INT]], ptr [[B1]], align 128 +; CHECK-NEXT: ret void +; + %x = load <4 x ptr addrspace(7)>, ptr %a + %b1 = getelementptr <4 x ptr addrspace(7)>, ptr %b, i64 2 + store <4 x ptr addrspace(7)> %x, ptr %b1 + ret void +} + +define void @alloca(ptr %a, ptr %b) { +; CHECK-LABEL: define void @alloca +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [5 x i160], align 32, addrspace(5) +; CHECK-NEXT: [[X:%.*]] = load i160, ptr [[A]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[X]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128 +; CHECK-NEXT: [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) +; CHECK-NEXT: [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32 +; CHECK-NEXT: [[L:%.*]] = 
getelementptr i160, ptr addrspace(5) [[ALLOCA]], i32 1 +; CHECK-NEXT: [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160 +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32 +; CHECK-NEXT: [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160 +; CHECK-NEXT: [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]] +; CHECK-NEXT: store i160 [[X_PTR_INT]], ptr addrspace(5) [[L]], align 32 +; CHECK-NEXT: [[Y:%.*]] = load i160, ptr addrspace(5) [[L]], align 32 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i160 [[Y]], 32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128 +; CHECK-NEXT: [[Y_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8) +; CHECK-NEXT: [[Y_PTR_OFF:%.*]] = trunc i160 [[Y]] to i32 +; CHECK-NEXT: [[Y_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[Y_PTR_RSRC]] to i160 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i160 [[Y_PTR_INT_RSRC]], 32 +; CHECK-NEXT: [[Y_PTR_INT_OFF:%.*]] = zext i32 [[Y_PTR_OFF]] to i160 +; CHECK-NEXT: [[Y_PTR_INT:%.*]] = or i160 [[TMP6]], [[Y_PTR_INT_OFF]] +; CHECK-NEXT: store i160 [[Y_PTR_INT]], ptr [[B]], align 32 +; CHECK-NEXT: ret void +; + %alloca = alloca [5 x ptr addrspace(7)], addrspace(5) + %x = load ptr addrspace(7), ptr %a + %l = getelementptr ptr addrspace(7), ptr addrspace(5) %alloca, i32 1 + store ptr addrspace(7) %x, ptr addrspace(5) %l + %y = load ptr addrspace(7), ptr addrspace(5) %l + store ptr addrspace(7) %y, ptr %b + ret void +} + +define void @complex_copy(ptr %a, ptr %b) { +; CHECK-LABEL: define void @complex_copy +; CHECK-SAME: (ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X:%.*]] = load { [2 x i160], i32, i160 }, ptr [[A]], align 32 +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { [2 x i160], i32, i160 } [[X]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue [2 x i160] [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i160 [[TMP2]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i160 [[TMP3]] to i128 +; CHECK-NEXT: [[X_0_0_PTR_RSRC:%.*]] = inttoptr i128 [[TMP4]] to ptr addrspace(8) +; CHECK-NEXT: [[X_0_0_PTR_OFF:%.*]] = trunc i160 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_0_0_PTR_RSRC]], 0 +; CHECK-NEXT: [[X_0_0_PTR:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP5]], i32 [[X_0_0_PTR_OFF]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue [2 x { ptr addrspace(8), i32 }] poison, { ptr addrspace(8), i32 } [[X_0_0_PTR]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue [2 x i160] [[TMP1]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i160 [[TMP7]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i160 [[TMP8]] to i128 +; CHECK-NEXT: [[X_0_1_PTR_RSRC:%.*]] = inttoptr i128 [[TMP9]] to ptr addrspace(8) +; CHECK-NEXT: [[X_0_1_PTR_OFF:%.*]] = trunc i160 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_0_1_PTR_RSRC]], 0 +; CHECK-NEXT: [[X_0_1_PTR:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP10]], i32 [[X_0_1_PTR_OFF]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue [2 x { ptr addrspace(8), i32 }] [[TMP6]], { ptr addrspace(8), i32 } [[X_0_1_PTR]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } poison, [2 x { ptr addrspace(8), i32 }] [[TMP11]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { [2 x i160], i32, i160 } [[X]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP12]], i32 [[TMP13]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { [2 x i160], i32, i160 } 
[[X]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = lshr i160 [[TMP15]], 32 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i160 [[TMP16]] to i128 +; CHECK-NEXT: [[X_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP17]] to ptr addrspace(8) +; CHECK-NEXT: [[X_2_PTR_OFF:%.*]] = trunc i160 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_2_PTR_RSRC]], 0 +; CHECK-NEXT: [[X_2_PTR:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP18]], i32 [[X_2_PTR_OFF]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP14]], { ptr addrspace(8), i32 } [[X_2_PTR]], 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP19]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractvalue [2 x { ptr addrspace(8), i32 }] [[TMP20]], 0 +; CHECK-NEXT: [[DOTRSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP21]], 0 +; CHECK-NEXT: [[DOTOFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP21]], 1 +; CHECK-NEXT: [[DOT0_0_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[DOTRSRC]] to i160 +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw i160 [[DOT0_0_INT_RSRC]], 32 +; CHECK-NEXT: [[DOT0_0_INT_OFF:%.*]] = zext i32 [[DOTOFF]] to i160 +; CHECK-NEXT: [[DOT0_0_INT:%.*]] = or i160 [[TMP22]], [[DOT0_0_INT_OFF]] +; CHECK-NEXT: [[TMP23:%.*]] = insertvalue [2 x i160] poison, i160 [[DOT0_0_INT]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractvalue [2 x { ptr addrspace(8), i32 }] [[TMP20]], 1 +; CHECK-NEXT: [[DOTRSRC1:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP24]], 0 +; CHECK-NEXT: [[DOTOFF2:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP24]], 1 +; CHECK-NEXT: [[DOT0_1_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[DOTRSRC1]] to i160 +; CHECK-NEXT: [[TMP25:%.*]] = shl nuw i160 [[DOT0_1_INT_RSRC]], 32 +; CHECK-NEXT: [[DOT0_1_INT_OFF:%.*]] = zext i32 [[DOTOFF2]] to i160 +; CHECK-NEXT: [[DOT0_1_INT:%.*]] = or i160 [[TMP25]], [[DOT0_1_INT_OFF]] +; CHECK-NEXT: [[TMP26:%.*]] = insertvalue [2 x i160] [[TMP23]], i160 [[DOT0_1_INT]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { [2 x i160], i32, i160 } poison, [2 x i160] [[TMP26]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP19]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { [2 x i160], i32, i160 } [[TMP27]], i32 [[TMP28]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP19]], 2 +; CHECK-NEXT: [[DOTRSRC3:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP30]], 0 +; CHECK-NEXT: [[DOTOFF4:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP30]], 1 +; CHECK-NEXT: [[DOT2_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[DOTRSRC3]] to i160 +; CHECK-NEXT: [[TMP31:%.*]] = shl nuw i160 [[DOT2_INT_RSRC]], 32 +; CHECK-NEXT: [[DOT2_INT_OFF:%.*]] = zext i32 [[DOTOFF4]] to i160 +; CHECK-NEXT: [[DOT2_INT:%.*]] = or i160 [[TMP31]], [[DOT2_INT_OFF]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { [2 x i160], i32, i160 } [[TMP29]], i160 [[DOT2_INT]], 2 +; CHECK-NEXT: store { [2 x i160], i32, i160 } [[TMP32]], ptr [[B]], align 32 +; CHECK-NEXT: ret void +; + %x = load {[2 x ptr addrspace(7)], i32, ptr addrspace(7)}, ptr %a + store {[2 x ptr addrspace(7)], i32, ptr addrspace(7)} %x, ptr %b + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll @@ -0,0 +1,411 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +define ptr addrspace(7) @gep(ptr addrspace(7) %in, i32 %idx) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @gep +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[IN:%.*]], i32 [[IDX:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[IN_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[IN]], 0 +; CHECK-NEXT: [[IN_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[IN]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[IDX]], 40 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 32 +; CHECK-NEXT: [[RET:%.*]] = add i32 [[IN_OFF]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[IN_RSRC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP3]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP4]] +; + %ret = getelementptr inbounds {i32, [4 x ptr]}, ptr addrspace(7) %in, i32 %idx, i32 1, i32 3 + ret ptr addrspace(7) %ret +} + +define <2 x ptr addrspace(7)> @gep_vectors(<2 x ptr addrspace(7)> %in, <2 x i32> %idx) { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @gep_vectors +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[IN:%.*]], <2 x i32> [[IDX:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[IN_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[IN]], 0 +; CHECK-NEXT: [[IN_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[IN]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <2 x i32> [[IDX]], +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[RET:%.*]] = add <2 x i32> [[IN_OFF]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[IN_RSRC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP3]], <2 x i32> [[RET]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP4]] +; + %ret = getelementptr inbounds {i32, [4 x ptr]}, <2 x ptr addrspace(7)> %in, <2 x i32> %idx, i32 1, i32 3 + ret <2 x ptr addrspace(7)> %ret +} + +define <2 x ptr addrspace(7)> @gep_vector_scalar(<2 x ptr addrspace(7)> %in, i64 %idx) { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @gep_vector_scalar +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[IN:%.*]], i64 [[IDX:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[IN_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[IN]], 0 +; CHECK-NEXT: [[IN_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[IN]], 1 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[IDX]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i64> [[DOTSPLAT]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <2 x i32> [[TMP2]], +; CHECK-NEXT: [[RET:%.*]] = add <2 x i32> 
[[IN_OFF]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[IN_RSRC]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP4]], <2 x i32> [[RET]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP5]] +; + %ret = getelementptr inbounds {i32, [4 x ptr]}, <2 x ptr addrspace(7)> %in, i64 %idx, i32 1, i32 3 + ret <2 x ptr addrspace(7)> %ret +} + +define i160 @ptrtoint(ptr addrspace(7) %ptr) { +; CHECK-LABEL: define i160 @ptrtoint +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[PTR_RSRC]] to i160 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i160 [[RET_RSRC]], 32 +; CHECK-NEXT: [[RET_OFF:%.*]] = zext i32 [[PTR_OFF]] to i160 +; CHECK-NEXT: [[RET:%.*]] = or i160 [[TMP1]], [[RET_OFF]] +; CHECK-NEXT: ret i160 [[RET]] +; + %ret = ptrtoint ptr addrspace(7) %ptr to i160 + ret i160 %ret +} + +define <2 x i160> @ptrtoint_vec(<2 x ptr addrspace(7)> %ptr) { +; CHECK-LABEL: define <2 x i160> @ptrtoint_vec +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[PTR]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = ptrtoint <2 x ptr addrspace(8)> [[PTR_RSRC]] to <2 x i160> +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw <2 x i160> [[RET_RSRC]], +; CHECK-NEXT: [[RET_OFF:%.*]] = zext <2 x i32> [[PTR_OFF]] to <2 x i160> +; CHECK-NEXT: [[RET:%.*]] = or <2 x i160> [[TMP1]], [[RET_OFF]] +; CHECK-NEXT: ret <2 x i160> [[RET]] +; + %ret = ptrtoint <2 x ptr addrspace(7)> %ptr to <2 x i160> + ret <2 x i160> %ret +} + +define i256 @ptrtoint_long(ptr addrspace(7) %ptr) { +; CHECK-LABEL: define i256 @ptrtoint_long +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[PTR_RSRC]] to i256 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i256 [[RET_RSRC]], 32 +; CHECK-NEXT: [[RET_OFF:%.*]] = zext i32 [[PTR_OFF]] to i256 +; CHECK-NEXT: [[RET:%.*]] = or i256 [[TMP1]], [[RET_OFF]] +; CHECK-NEXT: ret i256 [[RET]] +; + %ret = ptrtoint ptr addrspace(7) %ptr to i256 + ret i256 %ret +} + +define i64 @ptrtoint_short(ptr addrspace(7) %ptr) { +; CHECK-LABEL: define i64 @ptrtoint_short +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[PTR_RSRC]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[RET_RSRC]], 32 +; CHECK-NEXT: [[RET_OFF:%.*]] = zext i32 [[PTR_OFF]] to i64 +; CHECK-NEXT: [[RET:%.*]] = or i64 [[TMP1]], [[RET_OFF]] +; CHECK-NEXT: ret i64 [[RET]] +; + %ret = ptrtoint ptr addrspace(7) %ptr to i64 + ret i64 %ret +} + +define i32 @ptrtoint_offset(ptr addrspace(7) %ptr) { +; CHECK-LABEL: define i32 @ptrtoint_offset +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = 
extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[RET:%.*]] = or i32 poison, [[PTR_OFF]] +; CHECK-NEXT: ret i32 [[RET]] +; + %ret = ptrtoint ptr addrspace(7) %ptr to i32 + ret i32 %ret +} + +define ptr addrspace(7) @inttoptr(i160 %v) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @inttoptr +; CHECK-SAME: (i160 [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[V]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128 +; CHECK-NEXT: [[RET_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) +; CHECK-NEXT: [[RET_OFF:%.*]] = trunc i160 [[V]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP3]], i32 [[RET_OFF]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = inttoptr i160 %v to ptr addrspace(7) + ret ptr addrspace(7) %ret +} + +define <2 x ptr addrspace(7)> @inttoptr_vec(<2 x i160> %v) { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @inttoptr_vec +; CHECK-SAME: (<2 x i160> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i160> [[V]], +; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i160> [[TMP1]] to <2 x i128> +; CHECK-NEXT: [[RET_RSRC:%.*]] = inttoptr <2 x i128> [[TMP2]] to <2 x ptr addrspace(8)> +; CHECK-NEXT: [[RET_OFF:%.*]] = trunc <2 x i160> [[V]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP3]], <2 x i32> [[RET_OFF]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[RET]] +; + %ret = inttoptr <2 x i160> %v to <2 x ptr addrspace(7)> + ret <2 x ptr addrspace(7)> %ret +} + +define ptr addrspace(7) @inttoptr_long(i256 %v) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @inttoptr_long +; CHECK-SAME: (i256 [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i256 [[V]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i256 [[TMP1]] to i128 +; CHECK-NEXT: [[RET_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) +; CHECK-NEXT: [[RET_OFF:%.*]] = trunc i256 [[V]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP3]], i32 [[RET_OFF]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = inttoptr i256 %v to ptr addrspace(7) + ret ptr addrspace(7) %ret +} + +define ptr addrspace(7) @inttoptr_offset(i32 %v) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @inttoptr_offset +; CHECK-SAME: (i32 [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[V]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i128 +; CHECK-NEXT: [[RET_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP3]], i32 [[V]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = inttoptr i32 %v to ptr addrspace(7) + ret ptr addrspace(7) %ret +} + +define ptr addrspace(7) @addrspacecast(ptr addrspace(8) %buf) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @addrspacecast +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr 
addrspace(8), i32 } poison, ptr addrspace(8) [[BUF]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 0, 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) + ret ptr addrspace(7) %ret +} + +define <2 x ptr addrspace(7)> @addrspacecast_vec(<2 x ptr addrspace(8)> %buf) { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @addrspacecast_vec +; CHECK-SAME: (<2 x ptr addrspace(8)> [[BUF:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[BUF]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]], <2 x i32> zeroinitializer, 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[RET]] +; + %ret = addrspacecast <2 x ptr addrspace(8)> %buf to <2 x ptr addrspace(7)> + ret <2 x ptr addrspace(7)> %ret +} + +define i1 @icmp_eq(ptr addrspace(7) %a, ptr addrspace(7) %b) { +; CHECK-LABEL: define i1 @icmp_eq +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[A:%.*]], { ptr addrspace(8), i32 } [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[B]], 0 +; CHECK-NEXT: [[B_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[B]], 1 +; CHECK-NEXT: [[A_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[A]], 0 +; CHECK-NEXT: [[A_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[A]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = icmp eq ptr addrspace(8) [[A_RSRC]], [[B_RSRC]] +; CHECK-NEXT: [[RET_OFF:%.*]] = icmp eq i32 [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: [[RET:%.*]] = and i1 [[RET_RSRC]], [[RET_OFF]] +; CHECK-NEXT: ret i1 [[RET]] +; + %ret = icmp eq ptr addrspace(7) %a, %b + ret i1 %ret +} + +define i1 @icmp_ne(ptr addrspace(7) %a, ptr addrspace(7) %b) { +; CHECK-LABEL: define i1 @icmp_ne +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[A:%.*]], { ptr addrspace(8), i32 } [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[B]], 0 +; CHECK-NEXT: [[B_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[B]], 1 +; CHECK-NEXT: [[A_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[A]], 0 +; CHECK-NEXT: [[A_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[A]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = icmp ne ptr addrspace(8) [[A_RSRC]], [[B_RSRC]] +; CHECK-NEXT: [[RET_OFF:%.*]] = icmp ne i32 [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: [[RET:%.*]] = or i1 [[RET_RSRC]], [[RET_OFF]] +; CHECK-NEXT: ret i1 [[RET]] +; + %ret = icmp ne ptr addrspace(7) %a, %b + ret i1 %ret +} + +define <2 x i1> @icmp_eq_vec(<2 x ptr addrspace(7)> %a, <2 x ptr addrspace(7)> %b) { +; CHECK-LABEL: define <2 x i1> @icmp_eq_vec +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[A:%.*]], { <2 x ptr addrspace(8)>, <2 x i32> } [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[B]], 0 +; CHECK-NEXT: [[B_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[B]], 1 +; CHECK-NEXT: [[A_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[A]], 0 +; CHECK-NEXT: [[A_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[A]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = icmp eq <2 x ptr addrspace(8)> [[A_RSRC]], [[B_RSRC]] +; CHECK-NEXT: [[RET_OFF:%.*]] = icmp eq <2 x i32> [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: [[RET:%.*]] = and <2 x i1> [[RET_RSRC]], [[RET_OFF]] +; CHECK-NEXT: ret <2 x i1> [[RET]] +; + %ret = icmp eq <2 x ptr addrspace(7)> %a, %b + ret <2 x i1> %ret +} + 
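+;; Note: pointer equality combines the per-part comparisons with `and`, while
+;; inequality (below) combines them with `or`.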
+define <2 x i1> @icmp_ne_vec(<2 x ptr addrspace(7)> %a, <2 x ptr addrspace(7)> %b) { +; CHECK-LABEL: define <2 x i1> @icmp_ne_vec +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[A:%.*]], { <2 x ptr addrspace(8)>, <2 x i32> } [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[B]], 0 +; CHECK-NEXT: [[B_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[B]], 1 +; CHECK-NEXT: [[A_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[A]], 0 +; CHECK-NEXT: [[A_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[A]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = icmp ne <2 x ptr addrspace(8)> [[A_RSRC]], [[B_RSRC]] +; CHECK-NEXT: [[RET_OFF:%.*]] = icmp ne <2 x i32> [[A_OFF]], [[B_OFF]] +; CHECK-NEXT: [[RET:%.*]] = or <2 x i1> [[RET_RSRC]], [[RET_OFF]] +; CHECK-NEXT: ret <2 x i1> [[RET]] +; + %ret = icmp ne <2 x ptr addrspace(7)> %a, %b + ret <2 x i1> %ret +} + +define ptr addrspace(7) @freeze(ptr addrspace(7) %p) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @freeze +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 +; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = freeze ptr addrspace(8) [[P_RSRC]] +; CHECK-NEXT: [[RET_OFF:%.*]] = freeze i32 [[P_OFF]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET_OFF]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = freeze ptr addrspace(7) %p + ret ptr addrspace(7) %ret +} + +define <2 x ptr addrspace(7)> @freeze_vec(<2 x ptr addrspace(7)> %p) { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @freeze_vec +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[P]], 0 +; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[P]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = freeze <2 x ptr addrspace(8)> [[P_RSRC]] +; CHECK-NEXT: [[RET_OFF:%.*]] = freeze <2 x i32> [[P_OFF]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]], <2 x i32> [[RET_OFF]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[RET]] +; + %ret = freeze <2 x ptr addrspace(7)> %p + ret <2 x ptr addrspace(7)> %ret +} + +define ptr addrspace(7) @extractelement(<2 x ptr addrspace(7)> %v, i32 %i) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @extractelement +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[V:%.*]], i32 [[I:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[V_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[V]], 0 +; CHECK-NEXT: [[V_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[V]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = extractelement <2 x ptr addrspace(8)> [[V_RSRC]], i32 [[I]] +; CHECK-NEXT: [[RET_OFF:%.*]] = extractelement <2 x i32> [[V_OFF]], i32 [[I]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET_OFF]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = 
extractelement <2 x ptr addrspace(7)> %v, i32 %i + ret ptr addrspace(7) %ret +} + +define <2 x ptr addrspace(7)> @insertelement(<2 x ptr addrspace(7)> %v, ptr addrspace(7) %s, i32 %i) { +; CHECK-LABEL: define { <2 x ptr addrspace(8)>, <2 x i32> } @insertelement +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[V:%.*]], { ptr addrspace(8), i32 } [[S:%.*]], i32 [[I:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[S_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[S]], 0 +; CHECK-NEXT: [[S_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[S]], 1 +; CHECK-NEXT: [[V_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[V]], 0 +; CHECK-NEXT: [[V_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[V]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = insertelement <2 x ptr addrspace(8)> [[V_RSRC]], ptr addrspace(8) [[S_RSRC]], i32 [[I]] +; CHECK-NEXT: [[RET_OFF:%.*]] = insertelement <2 x i32> [[V_OFF]], i32 [[S_OFF]], i32 [[I]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]], <2 x i32> [[RET_OFF]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[RET]] +; + %ret = insertelement <2 x ptr addrspace(7)> %v, ptr addrspace(7) %s, i32 %i + ret <2 x ptr addrspace(7)> %ret +} + +define <4 x ptr addrspace(7)> @shufflevector(<2 x ptr addrspace(7)> %a, <2 x ptr addrspace(7)> %b) { +; CHECK-LABEL: define { <4 x ptr addrspace(8)>, <4 x i32> } @shufflevector +; CHECK-SAME: ({ <2 x ptr addrspace(8)>, <2 x i32> } [[A:%.*]], { <2 x ptr addrspace(8)>, <2 x i32> } [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[B]], 0 +; CHECK-NEXT: [[B_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[B]], 1 +; CHECK-NEXT: [[A_RSRC:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[A]], 0 +; CHECK-NEXT: [[A_OFF:%.*]] = extractvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[A]], 1 +; CHECK-NEXT: [[RET_RSRC:%.*]] = shufflevector <2 x ptr addrspace(8)> [[A_RSRC]], <2 x ptr addrspace(8)> [[B_RSRC]], <4 x i32> +; CHECK-NEXT: [[RET_OFF:%.*]] = shufflevector <2 x i32> [[A_OFF]], <2 x i32> [[B_OFF]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <4 x ptr addrspace(8)>, <4 x i32> } poison, <4 x ptr addrspace(8)> [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { <4 x ptr addrspace(8)>, <4 x i32> } [[TMP1]], <4 x i32> [[RET_OFF]], 1 +; CHECK-NEXT: ret { <4 x ptr addrspace(8)>, <4 x i32> } [[RET]] +; + %ret = shufflevector <2 x ptr addrspace(7)> %a, <2 x ptr addrspace(7)> %b, <4 x i32> + ret <4 x ptr addrspace(7)> %ret +} + +declare ptr addrspace(7) @llvm.ptrmask.p7.i160(ptr addrspace(7), i160) + +define ptr addrspace(7) @ptrmask(ptr addrspace(7) %p, i160 %mask) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @ptrmask +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[P:%.*]], i160 [[MASK:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 +; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[MASK]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i160 [[MASK]] to i32 +; CHECK-NEXT: [[RET_RSRC:%.*]] = call ptr addrspace(8) @llvm.ptrmask.p8.i128(ptr addrspace(8) [[P_RSRC]], i128 [[TMP2]]) +; CHECK-NEXT: [[RET_OFF:%.*]] = and i32 [[P_OFF]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = insertvalue { ptr 
addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP4]], i32 [[RET_OFF]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] +; + %ret = call ptr addrspace(7) @llvm.ptrmask.p7.i160(ptr addrspace(7) %p, i160 %mask) + ret ptr addrspace(7) %ret +} + +declare ptr @llvm.invariant.start.p7(i64, ptr addrspace(7) nocapture) +declare void @llvm.invariant.end.p7(ptr, i64, ptr addrspace(7) nocapture) + +define i32 @invariant_start_end(ptr addrspace(7) %p) { +; CHECK-LABEL: define i32 @invariant_start_end +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 +; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 +; CHECK-NEXT: [[INV:%.*]] = call ptr @llvm.invariant.start.p8(i64 256, ptr addrspace(8) [[P_RSRC]]) +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[P_RSRC]], i32 [[P_OFF]], i32 0, i32 0), !amdgpu.align !0 +; CHECK-NEXT: call void @llvm.invariant.end.p8(ptr [[INV]], i64 256, ptr addrspace(8) [[P_RSRC]]) +; CHECK-NEXT: ret i32 [[V]] +; + %inv = call ptr @llvm.invariant.start.p7(i64 256, ptr addrspace(7) %p) + %v = load i32, ptr addrspace(7) %p + call void @llvm.invariant.end.p7(ptr %inv, i64 256, ptr addrspace(7) %p) + ret i32 %v +} + +declare ptr addrspace(7) @llvm.launder.invariant.group.p7(ptr addrspace(7) nocapture) +declare ptr addrspace(7) @llvm.strip.invariant.group.p7(ptr addrspace(7) nocapture) + +define ptr addrspace(7) @invariant_group(ptr addrspace(7) %p) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @invariant_group +; CHECK-SAME: ({ ptr addrspace(8), i32 } [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 +; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 +; CHECK-NEXT: [[LAUNDERED:%.*]] = call ptr addrspace(8) @llvm.launder.invariant.group.p8(ptr addrspace(8) [[P_RSRC]]) +; CHECK-NEXT: [[STRIPPED:%.*]] = call ptr addrspace(8) @llvm.strip.invariant.group.p8(ptr addrspace(8) [[LAUNDERED]]) +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[STRIPPED]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[P_OFF]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; + %laundered = call ptr addrspace(7) @llvm.launder.invariant.group.p7(ptr addrspace(7) %p) + %stripped = call ptr addrspace(7) @llvm.strip.invariant.group.p7(ptr addrspace(7) %laundered) + ret ptr addrspace(7) %stripped +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers -check-debugify < %s | FileCheck %s +; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers,check-debugify < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn--" + +define float @debug_stash_pointer(ptr 
addrspace(8) %buf, i32 %idx, ptr addrspace(8) %aux) !dbg !5 { +; CHECK-LABEL: define float @debug_stash_pointer +; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[IDX:%.*]], ptr addrspace(8) [[AUX:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG5:![0-9]+]] { +; CHECK-NEXT: [[BUF_PTR_VAR:%.*]] = alloca i160, align 32, addrspace(5), !dbg [[DBG21:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(5) [[BUF_PTR_VAR]], metadata [[META10:![0-9]+]], metadata !DIExpression()), !dbg [[DBG21]] +; CHECK-NEXT: [[AUX_PTR_VAR:%.*]] = alloca i160, align 32, addrspace(5), !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(5) [[AUX_PTR_VAR]], metadata [[META12:![0-9]+]], metadata !DIExpression()), !dbg [[DBG22]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata [[META13:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 128, 32)), !dbg [[DBG23:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(8) [[BUF]], metadata [[META13]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 128)), !dbg [[DBG24:![0-9]+]] +; CHECK-NEXT: [[BUF_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF]] to i160, !dbg [[DBG23]] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i160 [[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG23]] +; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = or i160 [[TMP1]], 0, !dbg [[DBG23]] +; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG23]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata [[META15:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 128, 32)), !dbg [[DBG25:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(8) [[AUX]], metadata [[META15]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 128)), !dbg [[DBG26:![0-9]+]] +; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG25]] +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG25]] +; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = or i160 [[TMP2]], 0, !dbg [[DBG25]] +; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG25]] +; CHECK-NEXT: [[BUF_PTR_2:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG27:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i160 [[BUF_PTR_2]], 32, !dbg [[DBG27]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i160 [[TMP3]] to i128, !dbg [[DBG27]] +; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP4]] to ptr addrspace(8), !dbg [[DBG27]] +; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG27]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[BUF_PTR_2_PTR_OFF]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 128, 32)), !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 128)), !dbg [[DBG27]] +; CHECK-NEXT: [[TMP5:%.*]] = shl i32 [[IDX]], 32, !dbg [[DBG28]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 0, !dbg [[DBG28]] +; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[TMP6]], !dbg [[DBG28]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[BUF_PTR_3]], metadata [[META17:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 128, 32)), !dbg [[DBG29:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], metadata [[META17]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 128)), !dbg [[DBG28]] +; 
CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG29]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]] +; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG29]] +; CHECK-NEXT: [[BUF_PTR_3_INT:%.*]] = or i160 [[TMP7]], [[BUF_PTR_3_INT_OFF]], !dbg [[DBG29]] +; CHECK-NEXT: store i160 [[BUF_PTR_3_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG29]] +; CHECK-NEXT: [[BUF_PTR_4:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr i160 [[BUF_PTR_4]], 32, !dbg [[DBG30]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i160 [[TMP8]] to i128, !dbg [[DBG30]] +; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP9]] to ptr addrspace(8), !dbg [[DBG30]] +; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG30]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[BUF_PTR_4_PTR_OFF]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 128, 32)), !dbg [[DBG31:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 128)), !dbg [[DBG30]] +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG31]], !amdgpu.align !32 +; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[RET]], metadata [[META19:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31]] +; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = lshr i160 [[AUX_PTR_2]], 32, !dbg [[DBG33]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc i160 [[TMP10]] to i128, !dbg [[DBG33]] +; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP11]] to ptr addrspace(8), !dbg [[DBG33]] +; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG33]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[AUX_PTR_2_PTR_OFF]], metadata [[META20:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 128, 32)), !dbg [[DBG34:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], metadata [[META20]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 128)), !dbg [[DBG33]] +; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]] to i160, !dbg [[DBG34]] +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i160 [[BUF_PTR_4_PTR_INT_RSRC]], 32, !dbg [[DBG34]] +; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_OFF:%.*]] = zext i32 [[BUF_PTR_4_PTR_OFF]] to i160, !dbg [[DBG34]] +; CHECK-NEXT: [[BUF_PTR_4_PTR_INT:%.*]] = or i160 [[TMP12]], [[BUF_PTR_4_PTR_INT_OFF]], !dbg [[DBG34]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i160(i160 [[BUF_PTR_4_PTR_INT]], ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG34]], !amdgpu.align !35 +; CHECK-NEXT: ret float [[RET]], !dbg [[DBG36:![0-9]+]] +; + %buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20 + call void @llvm.dbg.value(metadata ptr addrspace(5) %buf.ptr.var, metadata !9, metadata !DIExpression()), !dbg !20 + %aux.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !21 + call void @llvm.dbg.value(metadata ptr addrspace(5) %aux.ptr.var, metadata !11, metadata !DIExpression()), !dbg !21 + %buf.ptr = 
addrspacecast ptr addrspace(8) %buf to ptr addrspace(7), !dbg !22 + call void @llvm.dbg.value(metadata ptr addrspace(7) %buf.ptr, metadata !12, metadata !DIExpression()), !dbg !22 + store ptr addrspace(7) %buf.ptr, ptr addrspace(5) %buf.ptr.var, align 32, !dbg !23 + %aux.ptr = addrspacecast ptr addrspace(8) %aux to ptr addrspace(7), !dbg !24 + call void @llvm.dbg.value(metadata ptr addrspace(7) %aux.ptr, metadata !14, metadata !DIExpression()), !dbg !24 + store ptr addrspace(7) %aux.ptr, ptr addrspace(5) %aux.ptr.var, align 32, !dbg !25 + %buf.ptr.2 = load ptr addrspace(7), ptr addrspace(5) %buf.ptr.var, align 32, !dbg !26 + call void @llvm.dbg.value(metadata ptr addrspace(7) %buf.ptr.2, metadata !15, metadata !DIExpression()), !dbg !26 + %buf.ptr.3 = getelementptr float, ptr addrspace(7) %buf.ptr.2, i32 %idx, !dbg !27 + call void @llvm.dbg.value(metadata ptr addrspace(7) %buf.ptr.3, metadata !16, metadata !DIExpression()), !dbg !27 + store ptr addrspace(7) %buf.ptr.3, ptr addrspace(5) %buf.ptr.var, align 32, !dbg !28 + %buf.ptr.4 = load ptr addrspace(7), ptr addrspace(5) %buf.ptr.var, align 32, !dbg !29 + call void @llvm.dbg.value(metadata ptr addrspace(7) %buf.ptr.4, metadata !17, metadata !DIExpression()), !dbg !29 + %ret = load float, ptr addrspace(7) %buf.ptr.4, align 4, !dbg !30 + call void @llvm.dbg.value(metadata float %ret, metadata !18, metadata !DIExpression()), !dbg !30 + %aux.ptr.2 = load ptr addrspace(7), ptr addrspace(5) %aux.ptr.var, align 32, !dbg !31 + call void @llvm.dbg.value(metadata ptr addrspace(7) %aux.ptr.2, metadata !19, metadata !DIExpression()), !dbg !31 + store ptr addrspace(7) %buf.ptr.4, ptr addrspace(7) %aux.ptr.2, align 32, !dbg !32 + ret float %ret, !dbg !33 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.value(metadata, metadata, metadata) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "", directory: "/") +!2 = !{i32 14} +!3 = !{i32 9} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "debug_stash_pointer", linkageName: "debug_stash_pointer", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9, !11, !12, !14, !15, !16, !17, !18, !19} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) +!11 = !DILocalVariable(name: "2", scope: !5, file: !1, line: 2, type: !10) +!12 = !DILocalVariable(name: "3", scope: !5, file: !1, line: 3, type: !13) +!13 = !DIBasicType(name: "ty256", size: 256, encoding: DW_ATE_unsigned) +!14 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 5, type: !13) +!15 = !DILocalVariable(name: "5", scope: !5, file: !1, line: 7, type: !13) +!16 = !DILocalVariable(name: "6", scope: !5, file: !1, line: 8, type: !13) +!17 = !DILocalVariable(name: "7", scope: !5, file: !1, line: 10, type: !13) +!18 = !DILocalVariable(name: "8", scope: !5, file: !1, line: 11, type: !10) +!19 = !DILocalVariable(name: "9", scope: !5, file: !1, line: 12, type: !13) +!20 = !DILocation(line: 1, column: 1, scope: !5) +!21 = !DILocation(line: 2, 
column: 1, scope: !5) +!22 = !DILocation(line: 3, column: 1, scope: !5) +!23 = !DILocation(line: 4, column: 1, scope: !5) +!24 = !DILocation(line: 5, column: 1, scope: !5) +!25 = !DILocation(line: 6, column: 1, scope: !5) +!26 = !DILocation(line: 7, column: 1, scope: !5) +!27 = !DILocation(line: 8, column: 1, scope: !5) +!28 = !DILocation(line: 9, column: 1, scope: !5) +!29 = !DILocation(line: 10, column: 1, scope: !5) +!30 = !DILocation(line: 11, column: 1, scope: !5) +!31 = !DILocation(line: 12, column: 1, scope: !5) +!32 = !DILocation(line: 13, column: 1, scope: !5) +!33 = !DILocation(line: 14, column: 1, scope: !5) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -153,6 +153,7 @@ "AMDGPULegalizerInfo.cpp", "AMDGPULibCalls.cpp", "AMDGPULibFunc.cpp", + "AMDGPULowerBufferFatPointers.cpp", "AMDGPULowerKernelArguments.cpp", "AMDGPULowerKernelAttributes.cpp", "AMDGPULowerModuleLDSPass.cpp",