Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -6,8 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-/// \file This pass adds target attributes to functions which use intrinsics
-/// which will impact calling convention lowering.
+/// \file This pass propagates the uniform-work-group-size attribute from
+/// kernels to leaf functions when possible. It also adds attributes that
+/// serve as hints for later ABI lowering.
 //
 //===----------------------------------------------------------------------===//
 
@@ -25,22 +26,14 @@
 using namespace llvm;
 
 namespace {
-static constexpr StringLiteral ImplicitAttrNames[] = {
-    // X ids unnecessarily propagated to kernels.
-    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
-    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
-    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
-    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
-    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};
-
 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
 private:
   const TargetMachine *TM = nullptr;
   SmallVector<CallGraphNode *, 8> NodeList;
 
-  bool addFeatureAttributes(Function &F);
   bool processUniformWorkGroupAttribute();
   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
+  bool addFeatureAttributes(Function &F);
 
 public:
   static char ID;
@@ -58,12 +51,6 @@
     AU.setPreservesAll();
     CallGraphSCCPass::getAnalysisUsage(AU);
   }
-
-  static bool visitConstantExpr(const ConstantExpr *CE);
-  static bool visitConstantExprsRecursively(
-      const Constant *EntryC,
-      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
-      bool HasApertureRegs);
 };
 
 } // end anonymous namespace
@@ -75,137 +62,6 @@
 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                 "Add AMDGPU function attributes", false, false)
 
-
-// The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS) {
-  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
-}
-
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
-  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
-}
-
-static bool isDSAddress(const Constant *C) {
-  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
-  if (!GV)
-    return false;
-  unsigned AS = GV->getAddressSpace();
-  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
-}
-
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
-  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
-    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
-    return castRequiresQueuePtr(SrcAS);
-  }
-
-  return false;
-}
-
-bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
-    const Constant *EntryC,
-    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
-    bool IsFunc, bool HasApertureRegs) {
-
-  if (!ConstantExprVisited.insert(EntryC).second)
-    return false;
-
-  SmallVector<const Constant *, 16> Stack;
-  Stack.push_back(EntryC);
-
-  while (!Stack.empty()) {
-    const Constant *C = Stack.pop_back_val();
-
-    // We need to trap on DS globals in non-entry functions.
-    if (IsFunc && isDSAddress(C))
-      return true;
-
-    // Check this constant expression.
-    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
-      if (!HasApertureRegs && visitConstantExpr(CE))
-        return true;
-    }
-
-    // Visit all sub-expressions.
-    for (const Use &U : C->operands()) {
-      const auto *OpC = dyn_cast<Constant>(U);
-      if (!OpC)
-        continue;
-
-      if (!ConstantExprVisited.insert(OpC).second)
-        continue;
-
-      Stack.push_back(OpC);
-    }
-  }
-
-  return false;
-}
-
-// We do not need to note the x workitem or workgroup id because they are always
-// initialized.
-//
-// TODO: We should not add the attributes if the known compile time workgroup
-// size is 1 for y/z.
-static StringRef intrinsicToAttrName(Intrinsic::ID ID,
-                                     bool &NonKernelOnly,
-                                     bool &IsQueuePtr) {
-  switch (ID) {
-  case Intrinsic::amdgcn_workitem_id_x:
-    NonKernelOnly = true;
-    return "amdgpu-work-item-id-x";
-  case Intrinsic::amdgcn_workgroup_id_x:
-    NonKernelOnly = true;
-    return "amdgpu-work-group-id-x";
-  case Intrinsic::amdgcn_workitem_id_y:
-  case Intrinsic::r600_read_tidig_y:
-    return "amdgpu-work-item-id-y";
-  case Intrinsic::amdgcn_workitem_id_z:
-  case Intrinsic::r600_read_tidig_z:
-    return "amdgpu-work-item-id-z";
-  case Intrinsic::amdgcn_workgroup_id_y:
-  case Intrinsic::r600_read_tgid_y:
-    return "amdgpu-work-group-id-y";
-  case Intrinsic::amdgcn_workgroup_id_z:
-  case Intrinsic::r600_read_tgid_z:
-    return "amdgpu-work-group-id-z";
-  case Intrinsic::amdgcn_dispatch_ptr:
-    return "amdgpu-dispatch-ptr";
-  case Intrinsic::amdgcn_dispatch_id:
-    return "amdgpu-dispatch-id";
-  case Intrinsic::amdgcn_implicitarg_ptr:
-    return "amdgpu-implicitarg-ptr";
-  case Intrinsic::amdgcn_queue_ptr:
-  case Intrinsic::amdgcn_is_shared:
-  case Intrinsic::amdgcn_is_private:
-    // TODO: Does not require queue ptr on gfx9+
-  case Intrinsic::trap:
-  case Intrinsic::debugtrap:
-    IsQueuePtr = true;
-    return "amdgpu-queue-ptr";
-  default:
-    return "";
-  }
-}
-
-static bool handleAttr(Function &Parent, const Function &Callee,
-                       StringRef Name) {
-  if (Callee.hasFnAttribute(Name)) {
-    Parent.addFnAttr(Name);
-    return true;
-  }
-  return false;
-}
-
-static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
-                                   bool &NeedQueuePtr) {
-  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
-    NeedQueuePtr = true;
-
-  for (StringRef AttrName : ImplicitAttrNames)
-    handleAttr(Parent, Callee, AttrName);
-}
-
 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
   bool Changed = false;
@@ -257,28 +113,10 @@
 }
 
 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
-  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
-  bool HasApertureRegs = ST.hasApertureRegs();
-  SmallPtrSet<const Constant *, 8> ConstantExprVisited;
-
   bool HaveStackObjects = false;
   bool Changed = false;
-  bool NeedQueuePtr = false;
   bool HaveCall = false;
-  bool HasIndirectCall = false;
   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
-  CallingConv::ID CC = F.getCallingConv();
-  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
-
-  // If this function hasAddressTaken() = true
-  // then add all attributes corresponding to the implicit args.
-  if (CallingConvSupportsAllImplicits &&
-      F.hasAddressTaken(nullptr, true, true, true)) {
-    for (StringRef AttrName : ImplicitAttrNames) {
-      F.addFnAttr(AttrName);
-    }
-    Changed = true;
-  }
 
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
@@ -293,59 +131,21 @@
         // Note the occurrence of indirect call.
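+        // An indirect callee is only treated here as evidence of a call for
+        // the "amdgpu-calls" hint below; which implicit arguments the caller
+        // actually needs is now inferred by the AMDGPUAttributor pass instead.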
         if (!Callee) {
-          if (!CB->isInlineAsm()) {
-            HasIndirectCall = true;
+          if (!CB->isInlineAsm())
             HaveCall = true;
-          }
+          continue;
         }
 
         Intrinsic::ID IID = Callee->getIntrinsicID();
         if (IID == Intrinsic::not_intrinsic) {
           HaveCall = true;
-          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
           Changed = true;
-        } else {
-          bool NonKernelOnly = false;
-
-          StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
-                                                   NeedQueuePtr);
-          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
-            F.addFnAttr(AttrName);
-            Changed = true;
-          }
         }
-      }
-
-      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
-        continue;
-
-      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
-        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
-          NeedQueuePtr = true;
-          continue;
-        }
-      }
-
-      for (const Use &U : I.operands()) {
-        const auto *OpC = dyn_cast<Constant>(U);
-        if (!OpC)
-          continue;
-
-        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
-                                          HasApertureRegs)) {
-          NeedQueuePtr = true;
-          break;
-        }
       }
     }
   }
 
-  if (NeedQueuePtr) {
-    F.addFnAttr("amdgpu-queue-ptr");
-    Changed = true;
-  }
-
   // TODO: We could refine this to captured pointers that could possibly be
   // accessed by flat instructions. For now this is mostly a poor way of
   // estimating whether there are calls before argument lowering.
@@ -359,28 +159,6 @@
     Changed = true;
   }
 
-  // This pass cannot copy attributes from callees to callers
-  // if there is an indirect call, and thus in such cases
-  // hasAddressTaken() would be false for kernels and functions
-  // making an indirect call (if they are themselves not indirectly called).
-  // We must tag all such kernels/functions with all implicit attributes
-  // for correctness.
-  // e.g.
-  // 1. Kernel K1 makes an indirect call to function F1.
-  //    Without detecting an indirect call in K1, this pass will not
-  //    add all implicit args to K1 (which is incorrect).
-  // 2. Kernel K1 makes a direct call to F1, which makes an indirect call to
-  //    function F2.
-  //    Without detecting an indirect call in F1 (whose hasAddressTaken() is
-  //    false), the pass will not add all implicit args to F1 (which is
-  //    essential for correctness).
-  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
-    for (StringRef AttrName : ImplicitAttrNames) {
-      F.addFnAttr(AttrName);
-    }
-    Changed = true;
-  }
-
   return Changed;
 }
 
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1101,8 +1101,13 @@
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
-  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
+  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+    addPass(createAMDGPUAttributorPass());
+
+    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
+    // analysis, and should be removed.
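+    // (The two attributes are "amdgpu-calls" and "amdgpu-stack-objects",
+    // which SIMachineFunctionInfo reads back when deciding whether flat
+    // scratch initialization is needed.)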
     addPass(createAMDGPUAnnotateKernelFeaturesPass());
+  }
 
   if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
       EnableLowerKernelArguments)
Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -67,8 +67,10 @@
   const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
                            CC != CallingConv::AMDGPU_Gfx &&
                            (!isEntryFunction() || HasCalls);
+  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
+                        CC == CallingConv::SPIR_KERNEL;
 
-  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+  if (IsKernel) {
     if (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)
       KernargSegmentPtr = true;
     WorkGroupIDX = true;
@@ -94,45 +96,76 @@
           ArgDescriptor::createRegister(ScratchRSrcReg);
     }
 
-    if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
+    if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
       ImplicitArgPtr = true;
   } else {
-    if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
-      KernargSegmentPtr = true;
-      MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
-                                 MaxKernArgAlign);
-    }
+    ImplicitArgPtr = false;
+    MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
+                               MaxKernArgAlign);
   }
 
+  bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
+  if (isAmdHsaOrMesa && !ST.enableFlatScratch())
+    PrivateSegmentBuffer = true;
+  else if (ST.isMesaGfxShader(F))
+    ImplicitBufferPtr = true;
+
   if (UseFixedABI) {
+    DispatchPtr = true;
+    QueuePtr = true;
+    ImplicitArgPtr = true;
     WorkGroupIDX = true;
     WorkGroupIDY = true;
     WorkGroupIDZ = true;
     WorkItemIDX = true;
     WorkItemIDY = true;
     WorkItemIDZ = true;
-    ImplicitArgPtr = true;
-  } else {
-    if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+
+    // FIXME: We don't need this?
+    DispatchID = true;
+  } else if (!AMDGPU::isGraphics(CC)) {
+    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
       WorkGroupIDX = true;
 
-    if (F.hasFnAttribute("amdgpu-work-group-id-y"))
+    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y"))
       WorkGroupIDY = true;
 
-    if (F.hasFnAttribute("amdgpu-work-group-id-z"))
+    if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
       WorkGroupIDZ = true;
 
-    if (F.hasFnAttribute("amdgpu-work-item-id-x"))
+    if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
      WorkItemIDX = true;
 
-    if (F.hasFnAttribute("amdgpu-work-item-id-y"))
+    if (!F.hasFnAttribute("amdgpu-no-workitem-id-y"))
       WorkItemIDY = true;
 
-    if (F.hasFnAttribute("amdgpu-work-item-id-z"))
+    if (!F.hasFnAttribute("amdgpu-no-workitem-id-z"))
       WorkItemIDZ = true;
+
+    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
+      DispatchPtr = true;
+
+    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
+      QueuePtr = true;
+
+    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
+      DispatchID = true;
   }
 
+  // FIXME: This attribute is a hack; we just need an analysis on the function
+  // to look for allocas.
   bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
+
+  // TODO: This could be refined a lot. The attribute is a poor way of
+  // detecting calls or stack objects that may require it before argument
+  // lowering.
+  if (ST.hasFlatAddressSpace() && isEntryFunction() &&
+      (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
+      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+      !ST.flatScratchIsArchitected()) {
+    FlatScratchInit = true;
+  }
+
   if (isEntryFunction()) {
     // X, XY, and XYZ are the only supported combinations, so make sure Y is
     // enabled if Z is.
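+    // For example, if only WorkItemIDZ was inferred above, WorkItemIDY must
+    // be enabled here as well.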
@@ -150,41 +183,6 @@ } } - bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (isAmdHsaOrMesa && !ST.enableFlatScratch()) - PrivateSegmentBuffer = true; - else if (ST.isMesaGfxShader(F)) - ImplicitBufferPtr = true; - - if (!AMDGPU::isGraphics(CC)) { - if (UseFixedABI) { - DispatchPtr = true; - QueuePtr = true; - - // FIXME: We don't need this? - DispatchID = true; - } else { - if (F.hasFnAttribute("amdgpu-dispatch-ptr")) - DispatchPtr = true; - - if (F.hasFnAttribute("amdgpu-queue-ptr")) - QueuePtr = true; - - if (F.hasFnAttribute("amdgpu-dispatch-id")) - DispatchID = true; - } - } - - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. - if (ST.hasFlatAddressSpace() && isEntryFunction() && - (isAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && - !ST.flatScratchIsArchitected()) { - FlatScratchInit = true; - } - Attribute A = F.getFnAttribute("amdgpu-git-ptr-high"); StringRef S = A.getValueAsString(); if (!S.empty()) Index: llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -27,30 +27,45 @@ } define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { -; HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast -; HSA-SAME: () #[[ATTR2:[0-9]+]] { -; HSA-NEXT: store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast +; AKF_HSA-SAME: () #[[ATTR1]] { +; AKF_HSA-NEXT: store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast +; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*) ret void } define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { -; HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat -; HSA-SAME: () #[[ATTR2]] { -; HSA-NEXT: store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*), align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat +; AKF_HSA-SAME: () #[[ATTR1]] { +; AKF_HSA-NEXT: store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*), align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat +; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*), align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*) ret void } define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 { -; HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat -; HSA-SAME: () #[[ATTR2]] { -; HSA-NEXT: store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* 
addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat +; AKF_HSA-SAME: () #[[ATTR1]] { +; AKF_HSA-NEXT: store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat +; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) ret void @@ -77,11 +92,17 @@ } define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { -; HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat -; HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { -; HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), align 4 -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[OUT]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat +; AKF_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), align 4 +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[OUT]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), align 4 +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[OUT]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) store i32 %val, i32 addrspace(1)* %out @@ -89,11 +110,17 @@ } define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { -; HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat -; HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { -; HSA-NEXT: [[VAL:%.*]] = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst, align 4 -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[OUT]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat +; AKF_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL:%.*]] = atomicrmw add i32 addrspace(4)* getelementptr ([256 x 
i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst, align 4 +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[OUT]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst, align 4 +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[OUT]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst store i32 %val, i32 addrspace(1)* %out @@ -101,12 +128,19 @@ } define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { -; HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat -; HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { -; HSA-NEXT: [[VAL:%.*]] = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 -; HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 -; HSA-NEXT: store i32 [[VAL0]], i32 addrspace(1)* [[OUT]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat +; AKF_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL:%.*]] = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 +; AKF_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 +; AKF_HSA-NEXT: store i32 [[VAL0]], i32 addrspace(1)* [[OUT]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4 +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0 +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL0]], i32 addrspace(1)* [[OUT]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 @@ -115,10 +149,15 @@ } define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { -; HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat -; HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { -; HSA-NEXT: call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* align 4 [[OUT]], i32 addrspace(4)* align 4 getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] 
addrspace(4)*), i64 0, i64 8), i32 32, i1 false) -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat +; AKF_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* align 4 [[OUT]], i32 addrspace(4)* align 4 getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i1 false) +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* align 4 [[OUT]], i32 addrspace(4)* align 4 getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i1 false) +; ATTRIBUTOR_HSA-NEXT: ret void ; call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* align 4 %out, i32 addrspace(4)* align 4 getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i1 false) ret void @@ -126,10 +165,15 @@ ; Can't just search the pointer value define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 { -; HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat -; HSA-SAME: (i32 addrspace(4)* addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { -; HSA-NEXT: store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* [[OUT]], align 8 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat +; AKF_HSA-SAME: (i32 addrspace(4)* addrspace(1)* [[OUT:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* [[OUT]], align 8 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(4)* addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* [[OUT]], align 8 +; ATTRIBUTOR_HSA-NEXT: ret void ; store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out ret void @@ -137,10 +181,15 @@ ; Can't just search pointer types define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 { -; HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat -; HSA-SAME: (i64 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { -; HSA-NEXT: store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* [[OUT]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: 
define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat +; AKF_HSA-SAME: (i64 addrspace(1)* [[OUT:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* [[OUT]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat +; ATTRIBUTOR_HSA-SAME: (i64 addrspace(1)* [[OUT:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* [[OUT]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out ret void @@ -148,19 +197,28 @@ ; Cast group to flat, do GEP, cast back to group define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group -; HSA-SAME: () #[[ATTR2]] { -; HSA-NEXT: store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*), align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group +; AKF_HSA-SAME: () #[[ATTR1]] { +; AKF_HSA-NEXT: store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*), align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group +; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*), align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) ret void } define i32 addrspace(3)* @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 { -; HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group -; HSA-SAME: () #[[ATTR2]] { -; HSA-NEXT: ret i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) +; AKF_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group +; AKF_HSA-SAME: () #[[ATTR1]] { +; AKF_HSA-NEXT: ret i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) +; +; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group +; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: ret i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) ; ret i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) } @@ -170,7 +228,6 @@ ;. ; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { argmemonly nofree nounwind willreturn } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-queue-ptr" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { argmemonly nofree nounwind willreturn } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } Index: llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -38,7 +38,7 @@ define void @use_workitem_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR2:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; AKF_HSA-NEXT: store volatile i32 [[VAL]], i32 addrspace(1)* undef, align 4 ; AKF_HSA-NEXT: ret void @@ -56,7 +56,7 @@ define void @use_workitem_id_z() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR3:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; AKF_HSA-NEXT: store volatile i32 [[VAL]], i32 addrspace(1)* undef, align 4 ; AKF_HSA-NEXT: ret void @@ -74,7 +74,7 @@ define void @use_workgroup_id_x() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR4:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() ; AKF_HSA-NEXT: store volatile i32 [[VAL]], i32 addrspace(1)* undef, align 4 ; AKF_HSA-NEXT: ret void @@ -92,7 +92,7 @@ define void @use_workgroup_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR5:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() ; AKF_HSA-NEXT: store volatile i32 [[VAL]], i32 addrspace(1)* undef, align 4 ; AKF_HSA-NEXT: ret void @@ -110,7 +110,7 @@ define void @use_workgroup_id_z() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR6:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() ; AKF_HSA-NEXT: store volatile i32 [[VAL]], i32 addrspace(1)* undef, align 4 ; AKF_HSA-NEXT: ret void @@ -128,7 +128,7 @@ define void @use_dispatch_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR7:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() ; 
AKF_HSA-NEXT: store volatile i8 addrspace(4)* [[DISPATCH_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8 ; AKF_HSA-NEXT: ret void @@ -146,7 +146,7 @@ define void @use_queue_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR8:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[QUEUE_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() ; AKF_HSA-NEXT: store volatile i8 addrspace(4)* [[QUEUE_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8 ; AKF_HSA-NEXT: ret void @@ -164,7 +164,7 @@ define void @use_dispatch_id() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR9:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id() ; AKF_HSA-NEXT: store volatile i64 [[VAL]], i64 addrspace(1)* undef, align 4 ; AKF_HSA-NEXT: ret void @@ -182,7 +182,7 @@ define void @use_workgroup_id_y_workgroup_id_z() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR10:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR2:[0-9]+]] { ; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() ; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() ; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* undef, align 4 @@ -236,7 +236,7 @@ define void @func_indirect_use_workitem_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR2]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_workitem_id_y() ; AKF_HSA-NEXT: ret void ; @@ -251,7 +251,7 @@ define void @func_indirect_use_workitem_id_z() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z -; AKF_HSA-SAME: () #[[ATTR3]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_workitem_id_z() ; AKF_HSA-NEXT: ret void ; @@ -266,7 +266,7 @@ define void @func_indirect_use_workgroup_id_x() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR4]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_workgroup_id_x() ; AKF_HSA-NEXT: ret void ; @@ -281,7 +281,7 @@ define void @kernel_indirect_use_workgroup_id_x() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x -; AKF_HSA-SAME: () #[[ATTR4]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_workgroup_id_x() ; AKF_HSA-NEXT: ret void ; @@ -296,7 +296,7 @@ define void @func_indirect_use_workgroup_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR5]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_workgroup_id_y() ; AKF_HSA-NEXT: ret void ; @@ -311,7 +311,7 @@ define void @func_indirect_use_workgroup_id_z() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR6]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_workgroup_id_z() ; AKF_HSA-NEXT: ret void ; @@ -326,7 +326,7 @@ define void @func_indirect_indirect_use_workgroup_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR5]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y() ; AKF_HSA-NEXT: ret void ; @@ -341,7 +341,7 @@ define void @indirect_x2_use_workgroup_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y -; AKF_HSA-SAME: () #[[ATTR5]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void 
@func_indirect_indirect_use_workgroup_id_y() ; AKF_HSA-NEXT: ret void ; @@ -356,7 +356,7 @@ define void @func_indirect_use_dispatch_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr -; AKF_HSA-SAME: () #[[ATTR7]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_dispatch_ptr() ; AKF_HSA-NEXT: ret void ; @@ -371,7 +371,7 @@ define void @func_indirect_use_queue_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr -; AKF_HSA-SAME: () #[[ATTR8]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_queue_ptr() ; AKF_HSA-NEXT: ret void ; @@ -386,7 +386,7 @@ define void @func_indirect_use_dispatch_id() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id -; AKF_HSA-SAME: () #[[ATTR9]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_dispatch_id() ; AKF_HSA-NEXT: ret void ; @@ -401,7 +401,7 @@ define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z -; AKF_HSA-SAME: () #[[ATTR11:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z() ; AKF_HSA-NEXT: ret void ; @@ -416,7 +416,7 @@ define void @recursive_use_workitem_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR2]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; AKF_HSA-NEXT: store volatile i32 [[VAL]], i32 addrspace(1)* undef, align 4 ; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() @@ -437,7 +437,7 @@ define void @call_recursive_use_workitem_id_y() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y -; AKF_HSA-SAME: () #[[ATTR2]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y() ; AKF_HSA-NEXT: ret void ; @@ -452,7 +452,7 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; AKF_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR8]] { +; AKF_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR1]] { ; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32 addrspace(4)* ; AKF_HSA-NEXT: store volatile i32 0, i32 addrspace(4)* [[STOF]], align 4 ; AKF_HSA-NEXT: ret void @@ -471,7 +471,7 @@ define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR12:[0-9]+]] { +; AKF_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { ; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32 addrspace(4)* ; AKF_HSA-NEXT: store volatile i32 0, i32 addrspace(4)* [[STOF]], align 4 ; AKF_HSA-NEXT: ret void @@ -489,7 +489,7 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR13:[0-9]+]] { +; AKF_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR3]] { ; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32 addrspace(4)* ; AKF_HSA-NEXT: store volatile i32 0, i32 addrspace(4)* [[STOF]], align 4 ; AKF_HSA-NEXT: call void @func_indirect_use_queue_ptr() @@ -510,7 +510,7 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 { ; AKF_HSA-LABEL: define 
{{[^@]+}}@indirect_use_group_to_flat_addrspacecast -; AKF_HSA-SAME: () #[[ATTR8]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast(i32 addrspace(3)* null) ; AKF_HSA-NEXT: ret void ; @@ -525,7 +525,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9 -; AKF_HSA-SAME: () #[[ATTR11]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* null) ; AKF_HSA-NEXT: ret void ; @@ -540,7 +540,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9 -; AKF_HSA-SAME: () #[[ATTR8]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* null) ; AKF_HSA-NEXT: ret void ; @@ -555,7 +555,7 @@ define void @use_kernarg_segment_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR11]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; AKF_HSA-NEXT: store volatile i8 addrspace(4)* [[KERNARG_SEGMENT_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8 ; AKF_HSA-NEXT: ret void @@ -572,7 +572,7 @@ } define void @func_indirect_use_kernarg_segment_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr -; AKF_HSA-SAME: () #[[ATTR11]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_kernarg_segment_ptr() ; AKF_HSA-NEXT: ret void ; @@ -587,7 +587,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR14:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR2]] { ; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() ; AKF_HSA-NEXT: store volatile i8 addrspace(4)* [[IMPLICITARG_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8 ; AKF_HSA-NEXT: ret void @@ -605,7 +605,7 @@ define void @use_implicitarg_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR15:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() ; AKF_HSA-NEXT: store volatile i8 addrspace(4)* [[IMPLICITARG_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8 ; AKF_HSA-NEXT: ret void @@ -623,7 +623,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr -; AKF_HSA-SAME: () #[[ATTR15]] { +; AKF_HSA-SAME: () #[[ATTR1]] { ; AKF_HSA-NEXT: call void @use_implicitarg_ptr() ; AKF_HSA-NEXT: ret void ; @@ -641,7 +641,7 @@ ; This function gets deleted. 
define internal void @defined.func() #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@defined.func -; AKF_HSA-SAME: () #[[ATTR16:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR4:[0-9]+]] { ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func @@ -653,7 +653,7 @@ define void @func_call_external() #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_call_external -; AKF_HSA-SAME: () #[[ATTR16]] { +; AKF_HSA-SAME: () #[[ATTR4]] { ; AKF_HSA-NEXT: call void @external.func() ; AKF_HSA-NEXT: ret void ; @@ -668,7 +668,7 @@ define void @func_call_defined() #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_call_defined -; AKF_HSA-SAME: () #[[ATTR16]] { +; AKF_HSA-SAME: () #[[ATTR4]] { ; AKF_HSA-NEXT: call void @defined.func() ; AKF_HSA-NEXT: ret void ; @@ -682,8 +682,8 @@ } define void @func_call_asm() #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_call_asm -; AKF_HSA-SAME: () #[[ATTR17:[0-9]+]] { -; AKF_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR17]] +; AKF_HSA-SAME: () #[[ATTR5:[0-9]+]] { +; AKF_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR5]] ; AKF_HSA-NEXT: ret void ; ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm @@ -697,7 +697,7 @@ define amdgpu_kernel void @kern_call_external() #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_external -; AKF_HSA-SAME: () #[[ATTR18:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR6:[0-9]+]] { ; AKF_HSA-NEXT: call void @external.func() ; AKF_HSA-NEXT: ret void ; @@ -712,7 +712,7 @@ define amdgpu_kernel void @func_kern_defined() #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_kern_defined -; AKF_HSA-SAME: () #[[ATTR18]] { +; AKF_HSA-SAME: () #[[ATTR6]] { ; AKF_HSA-NEXT: call void @defined.func() ; AKF_HSA-NEXT: ret void ; @@ -727,7 +727,7 @@ define i32 @use_dispatch_ptr_ret_type() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type -; AKF_HSA-SAME: () #[[ATTR19:[0-9]+]] { +; AKF_HSA-SAME: () #[[ATTR2]] { ; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() ; AKF_HSA-NEXT: store volatile i8 addrspace(4)* [[DISPATCH_PTR]], i8 addrspace(4)* addrspace(1)* undef, align 8 ; AKF_HSA-NEXT: ret i32 0 @@ -745,7 +745,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func -; AKF_HSA-SAME: () #[[ATTR19]] { +; AKF_HSA-SAME: () #[[ATTR2]] { ; AKF_HSA-NEXT: [[F:%.*]] = call float bitcast (i32 ()* @use_dispatch_ptr_ret_type to float ()*)() ; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; AKF_HSA-NEXT: ret float [[FADD]] @@ -763,7 +763,7 @@ define float @func_indirect_call(float()* %fptr) #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_call -; AKF_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR20:[0-9]+]] { +; AKF_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR5]] { ; AKF_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]() ; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; AKF_HSA-NEXT: ret float [[FADD]] @@ -782,7 +782,7 @@ declare float @extern() #3 define float @func_extern_call() #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call -; AKF_HSA-SAME: () #[[ATTR16]] { +; AKF_HSA-SAME: () #[[ATTR4]] { ; AKF_HSA-NEXT: [[F:%.*]] = call float @extern() ; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; AKF_HSA-NEXT: ret float [[FADD]] @@ -800,7 +800,7 @@ define float @func_null_call(float()* %fptr) #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_null_call -; AKF_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR20]] { +; AKF_HSA-SAME: (float ()* [[FPTR:%.*]]) #[[ATTR5]] { ; AKF_HSA-NEXT: [[F:%.*]] = call float 
null() ; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; AKF_HSA-NEXT: ret float [[FADD]] @@ -821,7 +821,7 @@ ; Calls some other recognized intrinsic define float @func_other_intrinsic_call(float %arg) #3 { ; AKF_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call -; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] { +; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR5]] { ; AKF_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]]) ; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00 ; AKF_HSA-NEXT: ret float [[FADD]] @@ -844,26 +844,12 @@ ;. ; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } -; AKF_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-work-item-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-work-item-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-work-item-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-work-group-id-x" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-work-group-id-y" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-work-group-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-dispatch-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-queue-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-dispatch-id" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "target-cpu"="fiji" } -; AKF_HSA: attributes #[[ATTR11]] = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR12]] = { nounwind "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" } -; AKF_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR16]] = { nounwind "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR17]] = { nounwind } -; AKF_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" } -; AKF_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "target-cpu"="fiji" } -; AKF_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } +; AKF_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" "uniform-work-group-size"="false" } +; AKF_HSA: attributes #[[ATTR2]] = { nounwind "target-cpu"="fiji" } +; AKF_HSA: attributes #[[ATTR3]] = { nounwind "target-cpu"="gfx900" 
"uniform-work-group-size"="false" } +; AKF_HSA: attributes #[[ATTR4]] = { nounwind "uniform-work-group-size"="false" } +; AKF_HSA: attributes #[[ATTR5]] = { nounwind } +; AKF_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-calls" "uniform-work-group-size"="false" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } Index: llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -32,11 +32,17 @@ } define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tgid_y -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { -; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %val, i32 addrspace(1)* %ptr @@ -44,13 +50,21 @@ } define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -60,13 
+74,21 @@ } define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tgid_x_y -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -76,11 +98,17 @@ } define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tgid_z -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { -; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_z +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_z +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %val, i32 addrspace(1)* %ptr @@ -88,13 +116,21 @@ } define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tgid_x_z -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@use_tgid_x_z +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -104,13 +140,21 @@ } define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tgid_y_z -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -120,15 +164,25 @@ } define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() 
+; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -152,11 +206,17 @@ } define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tidig_y -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { -; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.y() store i32 %val, i32 addrspace(1)* %ptr @@ -164,11 +224,17 @@ } define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tidig_z -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { -; HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_z +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_z +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val = call i32 @llvm.amdgcn.workitem.id.z() store i32 %val, i32 addrspace(1)* %ptr @@ -192,13 +258,21 @@ } define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], 
align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -208,15 +282,25 @@ } define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; ATTRIBUTOR_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() @@ -228,21 +312,37 @@ } define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_all_workitems -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { -; HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() -; HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() -; HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() -; HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() -; HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() -; HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL3]], i32 addrspace(1)* [[PTR]], 
align 4 -; HSA-NEXT: store volatile i32 [[VAL4]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: store volatile i32 [[VAL5]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_all_workitems +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; AKF_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; AKF_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; AKF_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; AKF_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL3]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL4]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: store volatile i32 [[VAL5]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_all_workitems +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; ATTRIBUTOR_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; ATTRIBUTOR_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; ATTRIBUTOR_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x() +; ATTRIBUTOR_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y() +; ATTRIBUTOR_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z() +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL3]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL4]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: store volatile i32 [[VAL5]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() @@ -260,13 +360,21 @@ } define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR10:[0-9]+]] { -; HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() -; HSA-NEXT: [[BC:%.*]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* -; HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* [[BC]], align 4 -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +; AKF_HSA-NEXT: [[BC:%.*]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* +; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* [[BC]], align 4 +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define 
{{[^@]+}}@use_dispatch_ptr +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR10:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +; ATTRIBUTOR_HSA-NEXT: [[BC:%.*]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* [[BC]], align 4 +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* @@ -276,13 +384,21 @@ } define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_queue_ptr -; HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR11:[0-9]+]] { -; HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() -; HSA-NEXT: [[BC:%.*]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* -; HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* [[BC]], align 4 -; HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr +; AKF_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() +; AKF_HSA-NEXT: [[BC:%.*]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* +; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* [[BC]], align 4 +; AKF_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR11:[0-9]+]] { +; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() +; ATTRIBUTOR_HSA-NEXT: [[BC:%.*]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)* +; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, i32 addrspace(4)* [[BC]], align 4 +; ATTRIBUTOR_HSA-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)* @@ -308,11 +424,17 @@ } define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast -; HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR11]] { -; HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32* -; HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast +; AKF_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32* +; AKF_HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(3)* [[PTR:%.*]]) #[[ATTR11]] { +; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(3)* [[PTR]] to i32* +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast i32 addrspace(3)* %ptr to i32* store volatile i32 0, i32* %stof @@ -320,11 +442,17 @@ } define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #1 { -; HSA-LABEL: define 
{{[^@]+}}@use_private_to_flat_addrspacecast -; HSA-SAME: (i32 addrspace(5)* [[PTR:%.*]]) #[[ATTR11]] { -; HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(5)* [[PTR]] to i32* -; HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast +; AKF_HSA-SAME: (i32 addrspace(5)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(5)* [[PTR]] to i32* +; AKF_HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast +; ATTRIBUTOR_HSA-SAME: (i32 addrspace(5)* [[PTR:%.*]]) #[[ATTR11]] { +; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast i32 addrspace(5)* [[PTR]] to i32* +; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, i32* [[STOF]], align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %stof = addrspacecast i32 addrspace(5)* %ptr to i32* store volatile i32 0, i32* %stof @@ -405,12 +533,19 @@ } define amdgpu_kernel void @use_is_shared(i8* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_is_shared -; HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR11]] { -; HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[PTR]]) -; HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 -; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_is_shared +; AKF_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[PTR]]) +; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 +; AKF_HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared +; ATTRIBUTOR_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR11]] { +; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(i8* [[PTR]]) +; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32 +; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr) %ext = zext i1 %is.shared to i32 @@ -419,12 +554,19 @@ } define amdgpu_kernel void @use_is_private(i8* %ptr) #1 { -; HSA-LABEL: define {{[^@]+}}@use_is_private -; HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR11]] { -; HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[PTR]]) -; HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 -; HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4 -; HSA-NEXT: ret void +; AKF_HSA-LABEL: define {{[^@]+}}@use_is_private +; AKF_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[PTR]]) +; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 +; AKF_HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4 +; AKF_HSA-NEXT: ret void +; +; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private +; ATTRIBUTOR_HSA-SAME: (i8* [[PTR:%.*]]) #[[ATTR11]] { +; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(i8* [[PTR]]) +; ATTRIBUTOR_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32 +; ATTRIBUTOR_HSA-NEXT: store i32 [[EXT]], i32 addrspace(1)* undef, align 4 +; ATTRIBUTOR_HSA-NEXT: ret void ; %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr) %ext = zext i1 %is.private to i32 @@ -434,7 +576,7 @@ define amdgpu_kernel void @use_alloca() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca -; AKF_HSA-SAME: () #[[ATTR12:[0-9]+]] { +; AKF_HSA-SAME: () 
#[[ATTR2:[0-9]+]] { ; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; AKF_HSA-NEXT: store i32 0, i32 addrspace(5)* [[ALLOCA]], align 4 ; AKF_HSA-NEXT: ret void @@ -452,7 +594,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block -; AKF_HSA-SAME: () #[[ATTR12]] { +; AKF_HSA-SAME: () #[[ATTR2]] { ; AKF_HSA-NEXT: entry: ; AKF_HSA-NEXT: br label [[BB:%.*]] ; AKF_HSA: bb: @@ -480,7 +622,7 @@ define void @use_alloca_func() #1 { ; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_func -; AKF_HSA-SAME: () #[[ATTR12]] { +; AKF_HSA-SAME: () #[[ATTR2]] { ; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5) ; AKF_HSA-NEXT: store i32 0, i32 addrspace(5)* [[ALLOCA]], align 4 ; AKF_HSA-NEXT: ret void @@ -502,17 +644,7 @@ ;. ; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } -; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-work-group-id-y" } -; AKF_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-work-group-id-z" } -; AKF_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" } -; AKF_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-work-item-id-y" } -; AKF_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-work-item-id-z" } -; AKF_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" } -; AKF_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } -; AKF_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } -; AKF_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-dispatch-ptr" } -; AKF_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-queue-ptr" } -; AKF_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-stack-objects" } +; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } ;. 
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } Index: llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -27,11 +27,17 @@ } define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tgid_y -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tgid.y() store i32 %val, i32 addrspace(1)* %ptr @@ -39,13 +45,21 @@ } define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -55,13 +69,21 @@ } define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; 
CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR2]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() @@ -71,11 +93,17 @@ } define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tgid_z -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() -; CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_z +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() +; AKF_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_z +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z() +; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tgid.z() store i32 %val, i32 addrspace(1)* %ptr @@ -83,13 +111,21 @@ } define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR3]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() +; 
ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.z() @@ -99,13 +135,21 @@ } define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.y() %val1 = call i32 @llvm.r600.read.tgid.z() @@ -115,15 +159,25 @@ } define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() -; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR4]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x() +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; 
ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() @@ -147,11 +201,17 @@ } define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tidig_y -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { -; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() -; CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() +; AKF_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR5:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y() +; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tidig.y() store i32 %val, i32 addrspace(1)* %ptr @@ -159,11 +219,17 @@ } define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tidig_z -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { -; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() -; CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_z +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() +; AKF_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_z +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR6:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z() +; ATTRIBUTOR_CHECK-NEXT: store i32 [[VAL]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val = call i32 @llvm.r600.read.tidig.z() store i32 %val, i32 addrspace(1)* %ptr @@ -187,13 +253,21 @@ } define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() -; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR7:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: 
[[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y() +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tidig.y() %val1 = call i32 @llvm.r600.read.tgid.y() @@ -203,15 +277,25 @@ } define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() +; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR8:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() +; ATTRIBUTOR_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() @@ -223,21 +307,37 @@ } define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { -; CHECK-LABEL: define {{[^@]+}}@use_all_workitems -; CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { -; CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() -; CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() -; CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() -; CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() -; CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() -; CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() -; CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL3]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL4]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: store volatile i32 [[VAL5]], i32 addrspace(1)* [[PTR]], align 4 -; CHECK-NEXT: ret 
void +; AKF_CHECK-LABEL: define {{[^@]+}}@use_all_workitems +; AKF_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR1]] { +; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() +; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() +; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() +; AKF_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() +; AKF_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() +; AKF_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() +; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL3]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL4]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: store volatile i32 [[VAL5]], i32 addrspace(1)* [[PTR]], align 4 +; AKF_CHECK-NEXT: ret void +; +; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_all_workitems +; ATTRIBUTOR_CHECK-SAME: (i32 addrspace(1)* [[PTR:%.*]]) #[[ATTR9:[0-9]+]] { +; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x() +; ATTRIBUTOR_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y() +; ATTRIBUTOR_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z() +; ATTRIBUTOR_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x() +; ATTRIBUTOR_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y() +; ATTRIBUTOR_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z() +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL0]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL1]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL2]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL3]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL4]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: store volatile i32 [[VAL5]], i32 addrspace(1)* [[PTR]], align 4 +; ATTRIBUTOR_CHECK-NEXT: ret void ; %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() @@ -316,14 +416,6 @@ ;. ; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } -; AKF_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-work-group-id-y" } -; AKF_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-work-group-id-z" } -; AKF_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" } -; AKF_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-work-item-id-y" } -; AKF_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-work-item-id-z" } -; AKF_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-item-id-y" } -; AKF_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } -; AKF_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" } ;. 
; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind readnone speculatable willreturn } ; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } Index: llvm/test/CodeGen/AMDGPU/call-constexpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-constexpr.ll +++ llvm/test/CodeGen/AMDGPU/call-constexpr.ll @@ -75,10 +75,10 @@ } ; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x: +; GCN: v_mov_b32_e32 v1, v0 ; GCN: s_getpc_b64 ; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12 -; GCN: v_or_b32_e32 v1, v0 ; GCN: v_mov_b32_e32 v0, 9 ; GCN: s_swappc_b64 ; GCN: v_add_f32_e32 Index: llvm/test/CodeGen/AMDGPU/call-waitcnt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -5,10 +5,10 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_arg_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_load_dword s4, s[8:9], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -28,10 +28,10 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_no_dep: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -52,10 +52,10 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -74,10 +74,10 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[8:9], 0x0 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; 
GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -97,9 +97,9 @@ define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -253,7 +253,33 @@ ; GCN: .amdhsa_system_sgpr_workgroup_id_z 1 ; GCN: .amdhsa_system_sgpr_workgroup_info 0 ; GCN: .amdhsa_system_vgpr_workitem_id 2 -define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { +define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 { + call void @use_every_sgpr_input() + ret void +} + +; We have to pass the kernarg segment pointer, but there are no kernel +; arguments, so null is passed. +; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input_no_kernargs: +; GCN: s_mov_b64 s[10:11], s[8:9] +; GCN: s_mov_b64 s[8:9], 0{{$}} +; GCN: s_mov_b32 s32, 0 +; GCN: s_swappc_b64 + +; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 +; GCN: .amdhsa_user_sgpr_dispatch_ptr 1 +; GCN: .amdhsa_user_sgpr_queue_ptr 1 +; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GCN: .amdhsa_user_sgpr_dispatch_id 1 +; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 +; GCN: .amdhsa_user_sgpr_private_segment_size 0 +; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +; GCN: .amdhsa_system_sgpr_workgroup_id_x 1 +; GCN: .amdhsa_system_sgpr_workgroup_id_y 1 +; GCN: .amdhsa_system_sgpr_workgroup_id_z 1 +; GCN: .amdhsa_system_sgpr_workgroup_info 0 +; GCN: .amdhsa_system_vgpr_workitem_id 2 +define amdgpu_kernel void @kern_indirect_use_every_sgpr_input_no_kernargs() #1 { call void @use_every_sgpr_input() ret void } Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -454,7 +454,31 @@ ; GCN: s_mov_b32 s14, s16 ; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 -define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { +define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 { + call void @use_every_sgpr_input() + ret void +} + +; We have to pass the kernarg segment pointer, but there are no kernel +; arguments, so null is passed.
+; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input_no_kernargs:
+; GCN: enable_sgpr_workgroup_id_x = 1
+; GCN: enable_sgpr_workgroup_id_y = 1
+; GCN: enable_sgpr_workgroup_id_z = 1
+; GCN: enable_sgpr_workgroup_info = 0
+
+; GCN: enable_sgpr_private_segment_buffer = 1
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: enable_sgpr_kernarg_segment_ptr = 0
+; GCN: enable_sgpr_dispatch_id = 1
+; GCN: enable_sgpr_flat_scratch_init = 1
+
+; GCN: s_mov_b64 s[10:11], s[8:9]
+; GCN: s_mov_b64 s[8:9], 0{{$}}
+; GCN: s_mov_b32 s32, 0
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_every_sgpr_input_no_kernargs() #1 {
   call void @use_every_sgpr_input()
   ret void
 }
Index: llvm/test/CodeGen/AMDGPU/cc-update.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/cc-update.ll
+++ llvm/test/CodeGen/AMDGPU/cc-update.ll
@@ -55,23 +55,23 @@
 define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_call:
 ; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_i32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; GFX803-NEXT: s_add_u32 s0, s0, s7
+; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
 ; GFX803-NEXT: s_getpc_b64 s[4:5]
 ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT: s_mov_b32 s32, 0
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX803-NEXT: s_endpgm
 ;
 ; GFX900-LABEL: test_kern_call:
 ; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s7
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: s_getpc_b64 s[4:5]
 ; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
@@ -82,12 +82,12 @@
 ;
 ; GFX1010-LABEL: test_kern_call:
 ; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
 ; GFX1010-NEXT: s_mov_b32 s32, 0
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
-; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_getpc_b64 s[4:5]
 ; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
@@ -102,16 +102,16 @@
 define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
 ; GFX803-LABEL: test_kern_stack_and_call:
 ; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_i32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; GFX803-NEXT: s_add_u32 s0, s0, s7
+; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
 ; GFX803-NEXT: s_getpc_b64 s[4:5]
 ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT: s_movk_i32 s32, 0x400
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -119,9 +119,9 @@
 ;
 ; GFX900-LABEL: test_kern_stack_and_call:
 ; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s7
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
 ; GFX900-NEXT: s_getpc_b64 s[4:5]
@@ -135,13 +135,13 @@
 ;
 ; GFX1010-LABEL: test_kern_stack_and_call:
 ; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
 ; GFX1010-NEXT: s_movk_i32 s32, 0x200
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_getpc_b64 s[4:5]
 ; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
@@ -215,24 +215,24 @@
 define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
 ; GFX803-LABEL: test_force_fp_kern_call:
 ; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_i32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; GFX803-NEXT: s_add_u32 s0, s0, s7
+; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
 ; GFX803-NEXT: s_getpc_b64 s[4:5]
 ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT: s_mov_b32 s32, 0
 ; GFX803-NEXT: s_mov_b32 s33, 0
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX803-NEXT: s_endpgm
 ;
 ; GFX900-LABEL: test_force_fp_kern_call:
 ; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s7
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: s_getpc_b64 s[4:5]
 ; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
@@ -244,13 +244,13 @@
 ;
 ; GFX1010-LABEL: test_force_fp_kern_call:
 ; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
 ; GFX1010-NEXT: s_mov_b32 s32, 0
 ; GFX1010-NEXT: s_mov_b32 s33, 0
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
-; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_getpc_b64 s[4:5]
 ; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
@@ -265,17 +265,17 @@
 define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
 ; GFX803-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX803: ; %bb.0: ; %entry
-; GFX803-NEXT: s_add_i32 s4, s4, s7
-; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
-; GFX803-NEXT: s_add_u32 s0, s0, s7
-; GFX803-NEXT: s_mov_b32 s33, 0
+; GFX803-NEXT: s_add_i32 s10, s10, s15
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX803-NEXT: s_add_u32 s0, s0, s15
 ; GFX803-NEXT: s_addc_u32 s1, s1, 0
+; GFX803-NEXT: s_mov_b32 s33, 0
 ; GFX803-NEXT: v_mov_b32_e32 v0, 0
-; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
 ; GFX803-NEXT: s_getpc_b64 s[4:5]
 ; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
 ; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+12
 ; GFX803-NEXT: s_movk_i32 s32, 0x400
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s11
 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
 ; GFX803-NEXT: s_waitcnt vmcnt(0)
 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5]
@@ -283,9 +283,9 @@
 ;
 ; GFX900-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX900: ; %bb.0: ; %entry
-; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
-; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
-; GFX900-NEXT: s_add_u32 s0, s0, s7
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
+; GFX900-NEXT: s_add_u32 s0, s0, s15
 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
 ; GFX900-NEXT: s_mov_b32 s33, 0
 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
@@ -300,14 +300,14 @@
 ;
 ; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
 ; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_add_u32 s4, s4, s7
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
 ; GFX1010-NEXT: s_movk_i32 s32, 0x200
 ; GFX1010-NEXT: s_mov_b32 s33, 0
-; GFX1010-NEXT: s_addc_u32 s5, s5, 0
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX1010-NEXT: v_mov_b32_e32 v0, 0
-; GFX1010-NEXT: s_add_u32 s0, s0, s7
+; GFX1010-NEXT: s_add_u32 s0, s0, s15
 ; GFX1010-NEXT: s_addc_u32 s1, s1, 0
 ; GFX1010-NEXT: s_getpc_b64 s[4:5]
 ; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4
Index: llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -168,10 +168,10 @@
 define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
 ; GCN-LABEL: v3i16_registers:
 ; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_load_dword s4, s[8:9], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GCN-NEXT: s_add_u32 s0, s0, s17
 ; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s4, 1, s4
@@ -213,10 +213,10 @@
 define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
 ; GCN-LABEL: v3f16_registers:
 ; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
-; GCN-NEXT: s_add_u32 s0, s0, s9
+; GCN-NEXT: s_load_dword s4, s[8:9], 0x0
+; GCN-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; GCN-NEXT: s_add_u32 s0, s0, s17
 ; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s4, 1, s4
Index: llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -3,21 +3,32 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor < %s | FileCheck -check-prefixes=GCN,ATTRIBUTOR_GCN %s
 define internal void @indirect() {
-; GCN-LABEL: define {{[^@]+}}@indirect
-; GCN-SAME: () #[[ATTR0:[0-9]+]] {
-; GCN-NEXT: ret void
+; AKF_GCN-LABEL: define {{[^@]+}}@indirect() {
+; AKF_GCN-NEXT: ret void
+;
+; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect
+; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
+; ATTRIBUTOR_GCN-NEXT: ret void
 ;
   ret void
 }
 define internal void @direct() {
-; GCN-LABEL: define {{[^@]+}}@direct
-; GCN-SAME: () #[[ATTR1:[0-9]+]] {
-; GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8
-; GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8
-; GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8
-; GCN-NEXT: call void [[FP]]()
-; GCN-NEXT: ret void
+; AKF_GCN-LABEL: define {{[^@]+}}@direct
+; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] {
+; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8
+; AKF_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8
+; AKF_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8
+; AKF_GCN-NEXT: call void [[FP]]()
+; AKF_GCN-NEXT: ret void
+;
+; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@direct
+; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
+; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8
+; ATTRIBUTOR_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8
+; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8
+; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
+; ATTRIBUTOR_GCN-NEXT: ret void
 ;
   %fptr = alloca void()*
   store void()* @indirect, void()** %fptr
@@ -27,23 +38,17 @@
 }
 define amdgpu_kernel void @test_direct_indirect_call() {
-; AKF_GCN-LABEL: define {{[^@]+}}@test_direct_indirect_call
-; AKF_GCN-SAME: () #[[ATTR2:[0-9]+]] {
-; AKF_GCN-NEXT: call void @direct()
-; AKF_GCN-NEXT: ret void
-;
-; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_direct_indirect_call
-; ATTRIBUTOR_GCN-SAME: () #[[ATTR1]] {
-; ATTRIBUTOR_GCN-NEXT: call void @direct()
-; ATTRIBUTOR_GCN-NEXT: ret void
+; GCN-LABEL: define {{[^@]+}}@test_direct_indirect_call
+; GCN-SAME: () #[[ATTR1:[0-9]+]] {
+; GCN-NEXT: call void @direct()
+; GCN-NEXT: ret void
 ;
   call void @direct()
   ret void
 }
 ;.
-; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
-; AKF_GCN: attributes #[[ATTR1]] = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
-; AKF_GCN: attributes #[[ATTR2]] = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" "uniform-work-group-size"="false" }
+; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-stack-objects" "uniform-work-group-size"="false" }
+; AKF_GCN: attributes #[[ATTR1]] = { "amdgpu-calls" "uniform-work-group-size"="false" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1344,13 +1344,13 @@
 ; CI-NEXT: s_getpc_b64 s[40:41]
 ; CI-NEXT: s_mov_b32 s40, s0
 ; CI-NEXT: s_load_dwordx4 s[40:43], s[40:41], 0x0
-; CI-NEXT: s_load_dwordx2 s[36:37], s[0:1], 0x9
-; CI-NEXT: s_load_dword s0, s[0:1], 0xb
+; CI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
+; CI-NEXT: s_load_dword s0, s[4:5], 0xb
 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_mov_b32 s32, 0
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s40, s40, s3
+; CI-NEXT: s_add_u32 s40, s40, s11
 ; CI-NEXT: s_addc_u32 s41, s41, 0
 ; CI-NEXT: v_add_i32_e32 v40, vcc, s0, v0
 ; CI-NEXT: s_getpc_b64 s[0:1]
@@ -1376,17 +1376,17 @@
 ; GFX9-NEXT: s_mov_b32 s36, s0
 ; GFX9-NEXT: s_load_dwordx4 s[36:39], s[36:37], 0x0
 ; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c
 ; GFX9-NEXT: s_mov_b32 s32, 0
 ; GFX9-NEXT: v_mov_b32_e32 v40, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s36, s36, s3
+; GFX9-NEXT: s_add_u32 s36, s36, s11
 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s0
 ; GFX9-NEXT: s_getpc_b64 s[0:1]
 ; GFX9-NEXT: s_add_u32 s0, s0, void_func_void@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s1, s1, void_func_void@gotpcrel32@hi+12
-; GFX9-NEXT: v_lshl_add_u32 v41, v0, 2, s2
 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX9-NEXT: ds_read_b32 v42, v41
 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
Index: llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -1,34 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefixes=GCN,AKF_GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefixes=GCN,ATTRIBUTOR_GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
 define internal void @indirect() {
-; GCN-LABEL: define {{[^@]+}}@indirect
-; GCN-SAME: () #[[ATTR0:[0-9]+]] {
-; GCN-NEXT: ret void
+; AKF_GCN-LABEL: define {{[^@]+}}@indirect() {
+; AKF_GCN-NEXT: ret void
+;
+; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect
+; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
+; ATTRIBUTOR_GCN-NEXT: ret void
 ;
-; CHECK-LABEL: define {{[^@]+}}@indirect
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: ret void
   ret void
 }
 define amdgpu_kernel void @test_simple_indirect_call() #0 {
-; GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
-; GCN-SAME: () #[[ATTR1:[0-9]+]] {
-; GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8
-; GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8
-; GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8
-; GCN-NEXT: call void [[FP]]()
-; GCN-NEXT: ret void
+; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
+; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] {
+; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8
+; AKF_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8
+; AKF_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8
+; AKF_GCN-NEXT: call void [[FP]]()
+; AKF_GCN-NEXT: ret void
+;
+; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
+; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
+; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8
+; ATTRIBUTOR_GCN-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8
+; ATTRIBUTOR_GCN-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8
+; ATTRIBUTOR_GCN-NEXT: call void [[FP]]()
+; ATTRIBUTOR_GCN-NEXT: ret void
 ;
-; CHECK-LABEL: define {{[^@]+}}@test_simple_indirect_call
-; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: [[FPTR:%.*]] = alloca void ()*, align 8
-; CHECK-NEXT: store void ()* @indirect, void ()** [[FPTR]], align 8
-; CHECK-NEXT: [[FP:%.*]] = load void ()*, void ()** [[FPTR]], align 8
-; CHECK-NEXT: call void [[FP]]()
-; CHECK-NEXT: ret void
   %fptr = alloca void()*
   store void()* @indirect, void()** %fptr
   %fp = load void()*, void()** %fptr
@@ -36,13 +37,11 @@
   ret void
 }
-; FIXME: Switch this to no-dispatch-id
-attributes #0 = { "amdgpu-dispatch-id" }
+attributes #0 = { "amdgpu-no-dispatch-id" }
 ;.
-; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
-; AKF_GCN: attributes #[[ATTR1]] = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-stack-objects" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-dispatch-id" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
 ;.
Index: llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -46,11 +46,11 @@
 define amdgpu_kernel void @kernel_calls_no_stack() {
 ; GCN-LABEL: kernel_calls_no_stack:
 ; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 s0, s0, s3
+; GCN-NEXT: s_add_u32 s6, s6, s11
 ; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
-; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GCN-NEXT: s_addc_u32 s7, s7, 0
+; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
+; GCN-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
 ; GCN-NEXT: s_getpc_b64 s[0:1]
 ; GCN-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
 ; GCN-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
Index: llvm/test/CodeGen/AMDGPU/indirect-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -4,7 +4,7 @@
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant void()*, align 4
 @gv.fptr1 = external hidden unnamed_addr addrspace(4) constant void(i32)*, align 4
-define amdgpu_kernel void @test_indirect_call_sgpr_ptr() {
+define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr:
 ; GCN: .amd_kernel_code_t
 ; GCN-NEXT: amd_code_version_major = 1
@@ -57,7 +57,7 @@
 ; GCN-NEXT: workitem_private_segment_byte_size = 16384
 ; GCN-NEXT: workgroup_group_segment_byte_size = 0
 ; GCN-NEXT: gds_segment_byte_size = 0
-; GCN-NEXT: kernarg_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 4
 ; GCN-NEXT: workgroup_fbarrier_count = 0
 ; GCN-NEXT: wavefront_sgpr_count = 37
 ; GCN-NEXT: workitem_vgpr_count = 32
@@ -86,8 +86,10 @@
 ; GCN-NEXT: s_getpc_b64 s[14:15]
 ; GCN-NEXT: s_add_u32 s14, s14, gv.fptr0@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s15, s15, gv.fptr0@rel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, 8
+; GCN-NEXT: s_addc_u32 s9, s9, 0
 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2
@@ -100,7 +102,7 @@
   ret void
 }
-define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() {
+define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GCN-LABEL: test_indirect_call_sgpr_ptr_arg:
 ; GCN: .amd_kernel_code_t
 ; GCN-NEXT: amd_code_version_major = 1
@@ -153,7 +155,7 @@
 ; GCN-NEXT: workitem_private_segment_byte_size = 16384
 ; GCN-NEXT: workgroup_group_segment_byte_size = 0
 ; GCN-NEXT: gds_segment_byte_size = 0
-; GCN-NEXT: kernarg_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 4
 ; GCN-NEXT: workgroup_fbarrier_count = 0
 ; GCN-NEXT: wavefront_sgpr_count = 37
 ; GCN-NEXT: workitem_vgpr_count = 32
@@ -183,8 +185,10 @@
 ; GCN-NEXT: s_add_u32 s14, s14, gv.fptr1@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s15, s15, gv.fptr1@rel32@hi+12
 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, 8
+; GCN-NEXT: s_addc_u32 s9, s9, 0
 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1
 ; GCN-NEXT: v_or_b32_e32 v31, v0, v2
 ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -50,6 +50,7 @@
 ; GCN-O0-NEXT: Expand vector predication intrinsics
 ; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
 ; GCN-O0-NEXT: Expand reduction intrinsics
+; GCN-O0-NEXT: AMDGPU Attributor
 ; GCN-O0-NEXT: CallGraph Construction
 ; GCN-O0-NEXT: Call Graph SCC Pass Manager
 ; GCN-O0-NEXT: AMDGPU Annotate Kernel Features
@@ -214,6 +215,7 @@
 ; GCN-O1-NEXT: Expand vector predication intrinsics
 ; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics
 ; GCN-O1-NEXT: Expand reduction intrinsics
+; GCN-O1-NEXT: AMDGPU Attributor
 ; GCN-O1-NEXT: CallGraph Construction
 ; GCN-O1-NEXT: Call Graph SCC Pass Manager
 ; GCN-O1-NEXT: AMDGPU Annotate Kernel Features
@@ -484,6 +486,7 @@
 ; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics
 ; GCN-O1-OPTS-NEXT: Expand reduction intrinsics
 ; GCN-O1-OPTS-NEXT: Early CSE
+; GCN-O1-OPTS-NEXT: AMDGPU Attributor
 ; GCN-O1-OPTS-NEXT: CallGraph Construction
 ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
 ; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features
@@ -768,6 +771,7 @@
 ; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics
 ; GCN-O2-NEXT: Expand reduction intrinsics
 ; GCN-O2-NEXT: Early CSE
+; GCN-O2-NEXT: AMDGPU Attributor
 ; GCN-O2-NEXT: CallGraph Construction
 ; GCN-O2-NEXT: Call Graph SCC Pass Manager
 ; GCN-O2-NEXT: AMDGPU Annotate Kernel Features
@@ -1067,6 +1071,7 @@
 ; GCN-O3-NEXT: Lazy Block Frequency Analysis
 ; GCN-O3-NEXT: Optimization Remark Emitter
 ; GCN-O3-NEXT: Global Value Numbering
+; GCN-O3-NEXT: AMDGPU Attributor
 ; GCN-O3-NEXT: CallGraph Construction
 ; GCN-O3-NEXT: Call Graph SCC Pass Manager
 ; GCN-O3-NEXT: AMDGPU Annotate Kernel Features
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -2,12 +2,16 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s
 ; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
-; GCN: enable_sgpr_kernarg_segment_ptr = 1
-
+; HSA: enable_sgpr_kernarg_segment_ptr = 0
 ; HSA: kernarg_segment_byte_size = 0
+
+; MESA: enable_sgpr_kernarg_segment_ptr = 1
 ; MESA: kernarg_segment_byte_size = 16
-; HSA: s_load_dword s0, s[4:5], 0x0
+; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; HSA: s_load_dword s0, [[NULL]], 0x0
+
+; MESA: s_load_dword s0, s[4:5], 0x0
 define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
   %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
   %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
@@ -82,12 +86,16 @@
 }
 ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
-; GCN: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: enable_sgpr_kernarg_segment_ptr = 0
 ; HSA: kernarg_segment_byte_size = 0
+
+; MESA: enable_sgpr_kernarg_segment_ptr = 1
 ; MESA: kernarg_segment_byte_size = 16
-; GCN-NOT: s[4:5]
-; GCN-NOT: s4
-; GCN-NOT: s5
+
+; HSA: s_mov_b64 s[4:5], 0{{$}}
+; MESA-NOT: s[4:5]
+; MESA-NOT: s4
+; MESA-NOT: s5
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
   call void @func_implicitarg_ptr()
Index: llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -36,7 +36,7 @@
 }
 ;.
-; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-dispatch-id" "amdgpu-dispatch-ptr" "amdgpu-implicitarg-ptr" "amdgpu-queue-ptr" "amdgpu-work-group-id-x" "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-x" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
+; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
 ;.
Index: llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
+++ llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll
@@ -8,12 +8,12 @@
 define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) {
 ; MUBUF-LABEL: kernel_background_evaluate:
 ; MUBUF: ; %bb.0: ; %entry
-; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24
+; MUBUF-NEXT: s_load_dword s0, s[4:5], 0x24
 ; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
 ; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
 ; MUBUF-NEXT: s_mov_b32 s38, -1
 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000
-; MUBUF-NEXT: s_add_u32 s36, s36, s3
+; MUBUF-NEXT: s_add_u32 s36, s36, s11
 ; MUBUF-NEXT: s_addc_u32 s37, s37, 0
 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000
 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
@@ -46,12 +46,12 @@
 ;
 ; FLATSCR-LABEL: kernel_background_evaluate:
 ; FLATSCR: ; %bb.0: ; %entry
-; FLATSCR-NEXT: s_add_u32 s2, s2, s5
+; FLATSCR-NEXT: s_add_u32 s8, s8, s13
 ; FLATSCR-NEXT: s_movk_i32 s32, 0x6000
-; FLATSCR-NEXT: s_addc_u32 s3, s3, 0
-; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; FLATSCR-NEXT: s_load_dword s2, s[0:1], 0x24
+; FLATSCR-NEXT: s_addc_u32 s9, s9, 0
+; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; FLATSCR-NEXT: s_load_dword s2, s[4:5], 0x24
 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000
 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000
 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 0