Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
@@ -55,7 +55,7 @@
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;

-ModulePass *createAMDGPULowerIntrinsicsPass();
+ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
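Note: the new parameter defaults to nullptr so the pass can still be constructed
without a TargetMachine, e.g. when run standalone under opt. A minimal
standalone sketch of that pattern follows; TargetMachine and LowerIntrinsics
here are stand-in types for illustration, not the real LLVM classes.

// Sketch: a pass that accepts an optional TargetMachine and degrades
// gracefully (changes nothing) when constructed without one.
#include <iostream>

struct TargetMachine {};  // placeholder for llvm::TargetMachine

class LowerIntrinsics {
  const TargetMachine *TM;

public:
  explicit LowerIntrinsics(const TargetMachine *TM = nullptr) : TM(TM) {}

  bool makeLIDRangeMetadata() const {
    if (!TM)
      return false;  // no subtarget info available: annotate nothing
    // ... with a TM we can query the subtarget and attach !range metadata ...
    return true;
  }
};

int main() {
  TargetMachine TM;
  std::cout << LowerIntrinsics(&TM).makeLIDRangeMetadata() << '\n';  // 1
  std::cout << LowerIntrinsics().makeLIDRangeMetadata() << '\n';     // 0
}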
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//

 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -23,10 +24,16 @@
 const unsigned MaxStaticSize = 1024;

 class AMDGPULowerIntrinsics : public ModulePass {
+private:
+  const TargetMachine *TM;
+
+  bool makeLIDRangeMetadata(Function &F) const;
+
 public:
   static char ID;

-  AMDGPULowerIntrinsics() : ModulePass(ID) { }
+  AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
+    : ModulePass(ID), TM(TM) { }

   bool runOnModule(Module &M) override;
   StringRef getPassName() const override {
     return "AMDGPU Lower Intrinsics";
@@ -39,8 +46,8 @@

 char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;

-INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
-                "Lower intrinsics", false, false)
+INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
+                   "Lower intrinsics", false, false)

 // TODO: Should refine based on estimated number of accesses (e.g. does it
 // require splitting based on alignment)
@@ -96,6 +103,23 @@
   return Changed;
 }

+bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
+  if (!TM)
+    return false;
+
+  bool Changed = false;
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+  for (auto *U : F.users()) {
+    auto *CI = dyn_cast<CallInst>(U);
+    if (!CI)
+      continue;
+
+    Changed |= ST.makeLIDRangeMetadata(CI);
+  }
+  return Changed;
+}
+
 bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
   bool Changed = false;

@@ -110,6 +134,19 @@
       if (expandMemIntrinsicUses(F))
         Changed = true;
       break;
+
+    case Intrinsic::amdgcn_workitem_id_x:
+    case Intrinsic::r600_read_tidig_x:
+    case Intrinsic::amdgcn_workitem_id_y:
+    case Intrinsic::r600_read_tidig_y:
+    case Intrinsic::amdgcn_workitem_id_z:
+    case Intrinsic::r600_read_tidig_z:
+    case Intrinsic::r600_read_local_size_x:
+    case Intrinsic::r600_read_local_size_y:
+    case Intrinsic::r600_read_local_size_z:
+      Changed |= makeLIDRangeMetadata(F);
+      break;
+
     default:
       break;
     }
@@ -118,6 +155,6 @@
   return Changed;
 }

-ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
-  return new AMDGPULowerIntrinsics();
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
+  return new AMDGPULowerIntrinsics(TM);
 }
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -38,7 +38,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -71,7 +70,6 @@
   const TargetMachine *TM;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
-  MDNode *MaxWorkGroupSizeRange = nullptr;
   AMDGPUAS AS;

   // FIXME: This should be per-kernel.
@@ -133,13 +131,6 @@
   Mod = &M;
   DL = &Mod->getDataLayout();

-  // The maximum workitem id.
-  //
-  // FIXME: Should get as subtarget property. Usually runtime enforced max is
-  // 256.
-  MDBuilder MDB(Mod->getContext());
-  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
-
   const Triple &TT = TM->getTargetTriple();
   IsAMDGCN = TT.getArch() == Triple::amdgcn;
@@ -258,6 +249,9 @@

 std::pair<Value *, Value *>
 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+      *Builder.GetInsertBlock()->getParent());
+
   if (!IsAMDHSA) {
     Function *LocalSizeYFn
       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
@@ -267,8 +261,8 @@
   CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
   CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

-  LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
-  LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  ST.makeLIDRangeMetadata(LocalSizeY);
+  ST.makeLIDRangeMetadata(LocalSizeZ);

   return std::make_pair(LocalSizeY, LocalSizeZ);
 }
@@ -333,7 +327,7 @@
   MDNode *MD = MDNode::get(Mod->getContext(), None);
   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
-  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  ST.makeLIDRangeMetadata(LoadZU);

   // Extract y component. Upper half of LoadZU should be zero already.
   Value *Y = Builder.CreateLShr(LoadXY, 16);
@@ -342,6 +336,8 @@
 }

 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+      *Builder.GetInsertBlock()->getParent());
   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;

   switch (N) {
@@ -364,7 +360,7 @@
   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
-  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  ST.makeLIDRangeMetadata(CI);

   return CI;
 }
@@ -690,8 +686,6 @@
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);

-  // FIXME: We should also try to get this value from the reqd_work_group_size
-  // function attribute if it is available.
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

   const DataLayout &DL = Mod->getDataLayout();
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -512,6 +512,9 @@
   /// compatible with minimum/maximum number of waves limited by flat work group
   /// size, register usage, and/or lds usage.
   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
+  /// Creates value range metadata on a workitem.id.* intrinsic call or load.
+  bool makeLIDRangeMetadata(Instruction *I) const;
 };

 class R600Subtarget final : public AMDGPUSubtarget {
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -16,6 +16,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include <algorithm>
@@ -240,6 +241,65 @@
   return Requested;
 }

+bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
+  Function *Kernel = I->getParent()->getParent();
+  unsigned MinSize = 0;
+  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
+  bool IdQuery = false;
+
+  // If reqd_work_group_size is present it narrows value down.
+  if (auto *CI = dyn_cast<CallInst>(I)) {
+    const Function *F = CI->getCalledFunction();
+    if (F) {
+      unsigned Dim = UINT_MAX;
+      switch (F->getIntrinsicID()) {
+      case Intrinsic::amdgcn_workitem_id_x:
+      case Intrinsic::r600_read_tidig_x:
+        IdQuery = true;
+      case Intrinsic::r600_read_local_size_x:
+        Dim = 0;
+        break;
+      case Intrinsic::amdgcn_workitem_id_y:
+      case Intrinsic::r600_read_tidig_y:
+        IdQuery = true;
+      case Intrinsic::r600_read_local_size_y:
+        Dim = 1;
+        break;
+      case Intrinsic::amdgcn_workitem_id_z:
+      case Intrinsic::r600_read_tidig_z:
+        IdQuery = true;
+      case Intrinsic::r600_read_local_size_z:
+        Dim = 2;
+        break;
+      default:
+        break;
+      }
+      if (Dim <= 3) {
+        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
+          if (Node->getNumOperands() == 3)
+            MinSize = MaxSize = mdconst::extract<ConstantInt>(
+                                  Node->getOperand(Dim))->getZExtValue();
+      }
+    }
+  }
+
+  if (!MaxSize)
+    return false;
+
+  // Range metadata is [Lo, Hi). For ID query we need to pass max size
+  // as Hi. For size query we need to pass Hi + 1.
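A standalone mirror of the [Lo, Hi) arithmetic implemented by
makeLIDRangeMetadata above may help; this is plain C++ with a hypothetical
Range struct, not compiler code. !range metadata is half-open: an ID query
yields values in [0, MaxSize), while a size query can return MaxSize itself,
so its Hi must be MaxSize + 1. With reqd_work_group_size, MinSize == MaxSize
== the required size for that dimension.

// Sketch of the range computation; Range models !range !{i32 Lo, i32 Hi}.
#include <cassert>

struct Range { unsigned Lo, Hi; };

Range lidRange(unsigned MinSize, unsigned MaxSize, bool IdQuery) {
  if (IdQuery)
    return {0, MaxSize};          // ids run 0 .. MaxSize-1
  return {MinSize, MaxSize + 1};  // sizes run MinSize .. MaxSize
}

int main() {
  // Default flat work group size maximum of 256, as in the updated tests:
  assert(lidRange(0, 256, true).Hi == 256);   // workitem.id.*: !{i32 0, i32 256}
  assert(lidRange(0, 256, false).Hi == 257);  // local.size.*:  !{i32 0, i32 257}
  // reqd_work_group_size = {32, 4, 1} pins MinSize == MaxSize per dimension:
  assert(lidRange(32, 32, true).Hi == 32);    // id.x: !{i32 0, i32 32}
}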
+  if (IdQuery)
+    MinSize = 0;
+  else
+    ++MaxSize;
+
+  MDBuilder MDB(I->getContext());
+  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
+                                                  APInt(32, MaxSize));
+  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  return true;
+}
+
 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                              const TargetMachine &TM) :
   AMDGPUSubtarget(TT, GPU, FS, TM),
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -555,12 +555,14 @@
 }

 void AMDGPUPassConfig::addIRPasses() {
+  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+
   // There is no reason to run these.
   disablePass(&StackMapLivenessID);
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);

-  addPass(createAMDGPULowerIntrinsicsPass());
+  addPass(createAMDGPULowerIntrinsicsPass(&TM));

   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
@@ -572,8 +574,6 @@
   // without ever running any passes on the second.
   addPass(createBarrierNoopPass());

-  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
-
   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     // TODO: May want to move later or split into an early and late one.
Index: llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/add.i16.ll
@@ -84,10 +84,10 @@

 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -27,8 +27,6 @@
 ; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
 ; HSA-PROMOTE: .end_amd_kernel_code_t

-; FIXME: These should be merged
-; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1
 ; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2

 ; SI-PROMOTE: ds_write_b32
@@ -58,9 +56,9 @@
 ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16

-; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
-; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1
-; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1
+; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !2
+; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !2

 ; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
 ; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]
@@ -77,9 +75,9 @@
 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0
 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0

-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1
 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
@@ -557,6 +555,8 @@
 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }

 ; HSAOPT: !0 = !{}
-; HSAOPT: !1 = !{i32 0, i32 2048}
+; HSAOPT: !1 = !{i32 0, i32 257}
+; HSAOPT: !2 = !{i32 0, i32 256}

-; NOHSAOPT: !0 = !{i32 0, i32 2048}
+; NOHSAOPT: !0 = !{i32 0, i32 257}
+; NOHSAOPT: !1 = !{i32 0, i32 256}
Index: llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -50,7 +50,7 @@
 ; GCN-LABEL: {{^}}s_ubfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -128,7 +128,7 @@
 ; GCN-LABEL: {{^}}s_sbfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -150,7 +150,7 @@
 ; Do scalar loads into the super register we need.
 ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
 ; CI: s_endpgm
 define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
@@ -173,7 +173,7 @@
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
 ; CI: s_endpgm
 define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -234,8 +234,8 @@
 }

 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -248,8 +248,8 @@
 }

 ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
 define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -355,8 +355,8 @@
 }

 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -370,8 +370,8 @@
 }

 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -206,8 +206,8 @@
 }

 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@@ -221,8 +221,8 @@
 }

 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
@@ -348,8 +348,8 @@
 }

 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -362,8 +362,8 @@
 }

 ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
 define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -45,11 +45,7 @@
 ; GCN-LABEL: {{^}}local_memory_two_objects:
 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
-
-; SI-DAG: ds_write_b32 [[ADDRW]],
-; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
+; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4

 ; GCN: s_barrier
Index: llvm/trunk/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -14,8 +14,8 @@
 }

 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
 define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
@@ -26,8 +26,8 @@

 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1:
 ; CHECK-NOT: v0
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
 define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
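These two updated checks follow from known-bits reasoning: a value carrying
!range [0, N) has at least 32 - ceil(log2(N)) guaranteed-zero high bits, so an
AND that only clears those bits is a no-op and the mask instruction disappears.
A small standalone illustration (the knownLeadingZeros helper is hypothetical,
not LLVM's KnownBits API):

// Why `and %id, 0x1ff` folds when %id carries !range !{i32 0, i32 512}:
// every value below 512 already fits in the low 9 bits the mask keeps.
#include <cassert>
#include <cstdint>

// Number of guaranteed-zero high bits for a 32-bit value known to be < Hi.
unsigned knownLeadingZeros(uint32_t Hi) {
  unsigned Bits = 0;
  while ((1u << Bits) < Hi)  // smallest width that can hold Hi - 1
    ++Bits;
  return 32 - Bits;
}

int main() {
  assert(knownLeadingZeros(512) == 23);  // 9 significant bits remain
  for (uint32_t Id = 0; Id < 512; ++Id)
    assert((Id & 0x1ff) == Id);          // the AND changes nothing
}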
Index: llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -12,9 +12,9 @@
 ; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
 ; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
+; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
+; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1

 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
@@ -295,6 +295,7 @@
   ret void
 }

-; OPT: !0 = !{i32 0, i32 2048}
+; OPT: !0 = !{i32 0, i32 257}
+; OPT: !1 = !{i32 0, i32 256}

 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
Index: llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -12,7 +12,7 @@
 ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i128, i128 addrspace(1)* %in.gep
@@ -56,7 +56,7 @@
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i128, i128 addrspace(1)* %in.gep
@@ -113,5 +113,7 @@

 declare i32 @llvm.amdgcn.workitem.id.x() #0

+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -9,7 +9,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -42,7 +42,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -58,7 +58,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -106,7 +106,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -122,7 +122,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -138,7 +138,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -156,7 +156,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep
@@ -383,5 +383,7 @@

 declare i32 @llvm.amdgcn.workitem.id.x() #0

+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: llvm/trunk/test/CodeGen/AMDGPU/shl.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shl.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shl.ll
@@ -4,6 +4,8 @@

 declare i32 @llvm.r600.read.tidig.x() #0

+declare i32 @llvm.r600.read.tgid.x() #0
+
 ;EG: {{^}}shl_v2i32:
 ;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@@ -288,7 +290,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
 define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %tid = call i32 @llvm.r600.read.tgid.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
   %a = load i64, i64 addrspace(1)* %gep.in
Index: llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,10 +85,10 @@

 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/trunk/test/CodeGen/AMDGPU/zext-lid.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/zext-lid.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/zext-lid.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
+
+; CHECK-NOT: and_b32
+
+; OPT-LABEL: @zext_grp_size_128
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !0
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0
+define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 127
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 127
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 127
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+; OPT-LABEL: @zext_grp_size_32x4x1
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4
+define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 31
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 3
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 1
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+; OPT-LABEL: @zext_grp_size_512
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5
+define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 65535
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 65535
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 65535
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+declare i32 @llvm.amdgcn.workitem.id.y() #2
+
+declare i32 @llvm.amdgcn.workitem.id.z() #2
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
+attributes #2 = { nounwind readnone }
+
+!0 = !{i32 32, i32 4, i32 1}
+
+; OPT: !0 = !{i32 0, i32 128}
+; OPT: !1 = !{i32 32, i32 4, i32 1}
+; OPT: !2 = !{i32 0, i32 32}
+; OPT: !3 = !{i32 0, i32 4}
+; OPT: !4 = !{i32 0, i32 1}
+; OPT: !5 = !{i32 0, i32 512}
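The !range payloads checked above can be derived mechanically from the
work-group-size inputs. A short standalone sketch (the printIdRange helper is
an assumption for illustration, not compiler code): each workitem id gets the
range [0, Max), where Max is the flat work group size maximum unless
reqd_work_group_size pins that dimension to an exact value.

// Derive the expected !range payloads for the three kernels above.
#include <cstdio>

// Reqd == 0 means no reqd_work_group_size entry for the dimension.
void printIdRange(const char *Name, unsigned FlatMax, unsigned Reqd) {
  unsigned Hi = Reqd ? Reqd : FlatMax;
  std::printf("%s: !{i32 0, i32 %u}\n", Name, Hi);
}

int main() {
  printIdRange("zext_grp_size_128    id.x/y/z", 128, 0);  // !{i32 0, i32 128}
  printIdRange("zext_grp_size_32x4x1 id.x",     128, 32); // !{i32 0, i32 32}
  printIdRange("zext_grp_size_32x4x1 id.y",     128, 4);  // !{i32 0, i32 4}
  printIdRange("zext_grp_size_32x4x1 id.z",     128, 1);  // !{i32 0, i32 1}
  printIdRange("zext_grp_size_512    id.x/y/z", 512, 0);  // !{i32 0, i32 512}
}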