Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -16236,10 +16236,22 @@ return CGF.Builder.CreateAddrSpaceCast(Call, RetTy); } +Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) { + auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr); + auto *Call = CGF.Builder.CreateCall(F); + Call->addRetAttr( + Attribute::getWithDereferenceableBytes(Call->getContext(), 64)); + Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4))); + return Call; +} + // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively. Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) { - const unsigned XOffset = 4; - auto *DP = EmitAMDGPUDispatchPtr(CGF); + bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion == + clang::TargetOptions::COV_5; + const unsigned XOffset = IsCOV_5 ? 12 : 4; + auto *DP = IsCOV_5 ? EmitAMDGPUImplicitArgPtr(CGF) : + EmitAMDGPUDispatchPtr(CGF); // Indexing the HSA kernel_dispatch_packet struct. 
auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2); auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset); Index: clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu =================================================================== --- clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu +++ clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu @@ -1,17 +1,31 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ // RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ -// RUN: | FileCheck %s +// RUN: | FileCheck -check-prefix=PRECOV5 %s + + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ +// RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefix=COV5 %s #include "Inputs/cuda.h" -// CHECK-LABEL: test_get_workgroup_size -// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() -// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4 -// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load -// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6 -// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load -// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8 -// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// PRECOV5-LABEL: test_get_workgroup_size +// PRECOV5: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4 +// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6 +// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8 +// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], 
!invariant.load + +// COV5-LABEL: test_get_workgroup_size +// COV5: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 12 +// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 14 +// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// COV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 16 +// COV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load __device__ void test_get_workgroup_size(int d, int *out) { switch (d) { @@ -22,4 +36,4 @@ } } -// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} +// PRECOV5-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} Index: clang/test/CodeGenOpenCL/builtins-amdgcn-workgroup-size-v5.cl =================================================================== --- /dev/null +++ clang/test/CodeGenOpenCL/builtins-amdgcn-workgroup-size-v5.cl @@ -0,0 +1,23 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -S -emit-llvm -mcode-object-version=5 -o - %s | FileCheck -enable-var-scope %s + + +// CHECK-LABEL: @test_get_workgroup_size( +// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 12 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 14 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load +// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i64 16 +// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load +void test_get_workgroup_size(int d, global int *out) +{ + switch (d) { + case 0: *out = 
__builtin_amdgcn_workgroup_size_x() + 1; break; + case 1: *out = __builtin_amdgcn_workgroup_size_y(); break; + case 2: *out = __builtin_amdgcn_workgroup_size_z(); break; + default: *out = 0; + } +} + +// CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} Index: llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -407,6 +407,9 @@ if (NeedsQueuePtr) { removeAssumedBits(QUEUE_PTR); + // We are going to use the implicit kernarg for V5. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) + removeAssumedBits(IMPLICIT_ARG_PTR); } if (funcRetrievesHostcallPtr(A)) { Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -320,8 +320,12 @@ enum ImplicitParameter { FIRST_IMPLICIT, + // TODO: GRID_DIM and GRID_OFFSET appear to be unused; consider removing them. GRID_DIM = FIRST_IMPLICIT, GRID_OFFSET, + PRIVATE_BASE, + SHARED_BASE, + QUEUE_PTR, }; /// Helper function that returns the byte offset of the given Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4385,6 +4385,14 @@ return ArgOffset; case GRID_OFFSET: return ArgOffset + 4; + // TODO: if the offset changes with code object version, we should include + // code object version in the calculation. 
+ case PRIVATE_BASE: + return ArgOffset + 192; + case SHARED_BASE: + return ArgOffset + 196; + case QUEUE_PTR: + return ArgOffset + 200; } llvm_unreachable("unexpected implicit parameter type"); } Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1808,6 +1808,37 @@ return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); } + // TODO: can we be smarter about machine pointer info? + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + // For code object version 5, private_base and shared_base are passed through + // implicit kernargs. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + AMDGPUTargetLowering::ImplicitParameter Param = + (AS == AMDGPUAS::LOCAL_ADDRESS) ? AMDGPUTargetLowering::SHARED_BASE : + AMDGPUTargetLowering::PRIVATE_BASE; + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); + + Register KernargPtrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + return Register(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT::scalar(32), commonAlignment(Align(64), Offset)); + + Register LoadAddr; + // Pointer address + B.materializePtrAdd(LoadAddr, KernargPtrReg, LLT::scalar(64), Offset); + // Load address + return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); + } + Register QueuePtr = MRI.createGenericVirtualRegister( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); @@ -1818,8 +1849,6 @@ // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; - // TODO: can we be smarter about machine pointer info? 
- MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | @@ -4815,6 +4844,46 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + MachineFunction &MF = B.getMF(); + const GCNSubtarget &ST = MF.getSubtarget(); + const LLT S64 = LLT::scalar(64); + + Register SGPR01(AMDGPU::SGPR0_SGPR1); + // For code object version 5, queue_ptr is passed through implicit kernarg. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + AMDGPUTargetLowering::ImplicitParameter Param = + AMDGPUTargetLowering::QUEUE_PTR; + uint64_t Offset = + ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); + + Register KernargPtrReg = MRI.createGenericVirtualRegister( + LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) + return false; + + // TODO: can we be smarter about machine pointer info? 
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + LLT::scalar(64), commonAlignment(Align(64), Offset)); + + // Pointer address + Register LoadAddr; + B.materializePtrAdd(LoadAddr, KernargPtrReg, LLT::scalar(64), Offset); + // Load address + Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); + B.buildCopy(SGPR01, Temp); + B.buildInstr(AMDGPU::S_TRAP) + .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)) + .addReg(SGPR01, RegState::Implicit); + MI.eraseFromParent(); + return true; + } + // Pass queue pointer to trap handler as input, and insert trap instruction // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi Register LiveIn = @@ -4822,7 +4891,6 @@ if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return false; - Register SGPR01(AMDGPU::SGPR0_SGPR1); B.buildCopy(SGPR01, LiveIn); B.buildInstr(AMDGPU::S_TRAP) .addImm(static_cast(GCNSubtarget::TrapID::LLVMAMDHSATrap)) Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5468,18 +5468,29 @@ SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); - SIMachineFunctionInfo *Info = MF.getInfo(); - Register UserSGPR = Info->getQueuePtrUserSGPR(); - SDValue QueuePtr; - if (UserSGPR == AMDGPU::NoRegister) { - // We probably are in a function incorrectly marked with - // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap, - // so just use a null pointer. - QueuePtr = DAG.getConstant(0, SL, MVT::i64); + // For code object version 5, QueuePtr is passed through implicit kernarg. 
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + uint64_t Offset = getImplicitParameterOffset(MF, QUEUE_PTR); + SDValue ArgPtr = lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); + + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + QueuePtr = DAG.getLoad(MVT::i64, SL, DAG.getEntryNode(), ArgPtr, PtrInfo, Align(8), + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); } else { - QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + Register UserSGPR = Info->getQueuePtrUserSGPR(); + + if (UserSGPR == AMDGPU::NoRegister) { + // We probably are in a function incorrectly marked with + // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap, + // so just use a null pointer. + QueuePtr = DAG.getConstant(0, SL, MVT::i64); + } else { + QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + } } SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); @@ -5556,6 +5567,20 @@ } MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); + + // For code object version 5, private_base and shared_base are passed through + // implicit kernargs. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + ImplicitParameter Param = (AS == AMDGPUAS::LOCAL_ADDRESS) ? + SHARED_BASE : PRIVATE_BASE; + uint64_t Offset = getImplicitParameterOffset(MF, Param); + SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset); + return DAG.getLoad(MVT::i32, DL, DAG.getEntryNode(), Ptr, PtrInfo, Align(4), + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + } + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); Register UserSGPR = Info->getQueuePtrUserSGPR(); if (UserSGPR == AMDGPU::NoRegister) { @@ -5577,7 +5602,6 @@ // TODO: Use custom target PseudoSourceValue. 
// TODO: We should use the value from the IR intrinsic call, but it might not // be available and how do we get it? - MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, commonAlignment(Align(64), StructOffset), MachineMemOperand::MODereferenceable | Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -57,6 +57,9 @@ /// \returns The offset of the hostcall pointer argument from implicitarg_ptr unsigned getHostcallImplicitArgPosition(); +/// \returns amdhsa code object version. +unsigned getAmdhsaCodeObjectVersion(); + struct GcnBufferFormatInfo { unsigned Format; unsigned BitsPerComp; Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -152,6 +152,10 @@ } } +unsigned getAmdhsaCodeObjectVersion() { + return AmdhsaCodeObjectVersion; +} + #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL Index: llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -0,0 +1,549 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=GFX8V3 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=GFX8V4 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < 
%s | FileCheck --check-prefix=GFX8V5 %s + +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefixes=GFX9V3 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=GFX9V4 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s + +define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) { +; GFX8V3-LABEL: addrspacecast: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x44 +; GFX8V3-NEXT: s_load_dword s5, s[4:5], 0x40 +; GFX8V3-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_mov_b32 s2, s0 +; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8V3-NEXT: s_mov_b32 s4, s1 +; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V3-NEXT: flat_store_dword v[0:1], v2 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_store_dword v[0:1], v2 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: addrspacecast: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s5, s[4:5], 0x40 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_mov_b32 s2, s0 +; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8V4-NEXT: s_mov_b32 s4, s1 +; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX8V4-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: flat_store_dword v[0:1], v2 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_store_dword v[0:1], v2 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: addrspacecast: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dword s3, s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dword s5, s[6:7], 0xcc +; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_mov_b32 s2, s0 +; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8V5-NEXT: s_mov_b32 s4, s1 +; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V5-NEXT: flat_store_dword v[0:1], v2 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, 2 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_store_dword v[0:1], v2 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: addrspacecast: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s3, s2, 16 +; GFX9V3-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_mov_b32 s2, s0 +; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9V3-NEXT: s_lshl_b32 s5, s4, 16 +; GFX9V3-NEXT: s_mov_b32 s4, s1 +; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V3-NEXT: flat_store_dword v[0:1], v2 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 
+; GFX9V3-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V3-NEXT: flat_store_dword v[0:1], v2 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: addrspacecast: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s3, s2, 16 +; GFX9V4-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_mov_b32 s2, s0 +; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9V4-NEXT: s_lshl_b32 s5, s4, 16 +; GFX9V4-NEXT: s_mov_b32 s4, s1 +; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V4-NEXT: flat_store_dword v[0:1], v2 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: v_mov_b32_e32 v2, 2 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V4-NEXT: flat_store_dword v[0:1], v2 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: addrspacecast: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s3, s2, 16 +; GFX9V5-NEXT: s_getreg_b32 s4, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: v_mov_b32_e32 v2, 1 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_mov_b32 s2, s0 +; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9V5-NEXT: s_lshl_b32 s5, s4, 16 +; GFX9V5-NEXT: s_mov_b32 s4, s1 +; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX9V5-NEXT: flat_store_dword v[0:1], v2 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: 
v_mov_b32_e32 v2, 2 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX9V5-NEXT: flat_store_dword v[0:1], v2 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32* + %flat.local = addrspacecast i32 addrspace(3)* %ptr.local to i32* + store volatile i32 1, i32* %flat.private + store volatile i32 2, i32* %flat.local + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_shared: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_shared: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_shared: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_shared: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_getreg_b32 s0, 
hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_shared: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_shared: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr) + %zext = zext i1 %is.shared to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_private: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_private: +; GFX8V4: ; %bb.0: +; 
GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_private: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_private: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V3-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_private: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V4-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_private: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_getreg_b32 s0, 
hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX9V5-NEXT: s_cselect_b32 s0, 1, 0 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr) + %zext = zext i1 %is.private to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_trap() { +; GFX8V3-LABEL: llvm_trap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V3-NEXT: s_trap 2 +; +; GFX8V4-LABEL: llvm_trap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V4-NEXT: s_trap 2 +; +; GFX8V5-LABEL: llvm_trap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_trap 2 +; +; GFX9V3-LABEL: llvm_trap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9V3-NEXT: s_trap 2 +; +; GFX9V4-LABEL: llvm_trap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 2 +; +; GFX9V5-LABEL: llvm_trap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 2 + call void @llvm.trap() + unreachable +} + +define amdgpu_kernel void @llvm_debugtrap() { +; GFX8V3-LABEL: llvm_debugtrap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_trap 3 +; +; GFX8V4-LABEL: llvm_debugtrap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_trap 3 +; +; GFX8V5-LABEL: llvm_debugtrap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_trap 3 +; +; GFX9V3-LABEL: llvm_debugtrap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_trap 3 +; +; GFX9V4-LABEL: llvm_debugtrap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 3 +; +; GFX9V5-LABEL: llvm_debugtrap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 3 + call void @llvm.debugtrap() + unreachable +} + +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s7 
+; GFX8V3-NEXT: s_add_u32 s0, s8, 8 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v3, s1 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s0 +; GFX8V3-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s10 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s11 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v3, s1 +; GFX8V4-NEXT: v_mov_b32_e32 v2, s0 +; GFX8V4-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; 
GFX8V5-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s10 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s11 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v3, s1 +; GFX8V5-NEXT: v_mov_b32_e32 v2, s0 +; GFX8V5-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V4-NEXT: ; kill: killed 
$sgpr6_sgpr7 +; GFX9V4-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V5-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr() + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %dispatch.id = call i64 @llvm.amdgcn.dispatch.id() + %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr + %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr + %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr + store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr + ret void +} + +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() +declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +declare i64 @llvm.amdgcn.dispatch.id() +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +declare i1 @llvm.amdgcn.is.shared(i8*) +declare i1 @llvm.amdgcn.is.private(i8*) +declare void @llvm.trap() +declare void @llvm.debugtrap() Index: llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll =================================================================== --- /dev/null 
+++ llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -0,0 +1,552 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=GFX8V3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=GFX8V4 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=GFX8V5 %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefixes=GFX9V3 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefixes=GFX9V4 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefixes=GFX9V5 %s + +define amdgpu_kernel void @addrspacecast(i32 addrspace(5)* %ptr.private, i32 addrspace(3)* %ptr.local) { +; GFX8V3-LABEL: addrspacecast: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V3-NEXT: s_load_dword s2, s[4:5], 0x44 +; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x40 +; GFX8V3-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V3-NEXT: v_mov_b32_e32 v2, s1 +; GFX8V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V3-NEXT: flat_store_dword v[0:1], v4 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, 2 +; GFX8V3-NEXT: flat_store_dword v[2:3], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; 
GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: addrspacecast: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V4-NEXT: s_load_dword s2, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x40 +; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V4-NEXT: v_mov_b32_e32 v2, s1 +; GFX8V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V4-NEXT: flat_store_dword v[0:1], v4 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, 2 +; GFX8V4-NEXT: flat_store_dword v[2:3], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: addrspacecast: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX8V5-NEXT: s_load_dword s2, s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dword s3, s[6:7], 0xcc +; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX8V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX8V5-NEXT: v_mov_b32_e32 v2, s1 +; GFX8V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX8V5-NEXT: flat_store_dword v[0:1], v4 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, 2 +; GFX8V5-NEXT: flat_store_dword v[2:3], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm 
+; +; GFX9V3-LABEL: addrspacecast: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V3-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V3-NEXT: v_mov_b32_e32 v4, 1 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V3-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V3-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v2, s0 +; GFX9V3-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V3-NEXT: v_mov_b32_e32 v2, s1 +; GFX9V3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V3-NEXT: flat_store_dword v[0:1], v4 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, 2 +; GFX9V3-NEXT: flat_store_dword v[2:3], v0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: addrspacecast: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V4-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V4-NEXT: v_mov_b32_e32 v4, 1 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V4-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v2, s0 +; GFX9V4-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V4-NEXT: v_mov_b32_e32 v2, s1 +; GFX9V4-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; 
GFX9V4-NEXT: flat_store_dword v[0:1], v4 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, 2 +; GFX9V4-NEXT: flat_store_dword v[2:3], v0 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: addrspacecast: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9V5-NEXT: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX9V5-NEXT: v_mov_b32_e32 v4, 1 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1 +; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX9V5-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1 +; GFX9V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v2, s0 +; GFX9V5-NEXT: s_cselect_b64 vcc, -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GFX9V5-NEXT: v_mov_b32_e32 v2, s1 +; GFX9V5-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9V5-NEXT: flat_store_dword v[0:1], v4 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, 2 +; GFX9V5-NEXT: flat_store_dword v[2:3], v0 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %flat.private = addrspacecast i32 addrspace(5)* %ptr.private to i32* + %flat.local = addrspacecast i32 addrspace(3)* %ptr.local to i32* + store volatile i32 1, i32* %flat.private + store volatile i32 2, i32* %flat.local + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_shared(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_shared: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; 
GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_shared: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_shared: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xcc +; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_shared: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dword s0, s[6:7], 0x4 +; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_shared: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dword s0, s[6:7], 0x4 +; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_shared: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dword s0, s[6:7], 
0x4 +; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %ptr) + %zext = zext i1 %is.shared to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_amdgcn_is_private(i8* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_is_private: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V3-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V3-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V3-NEXT: flat_store_dword v[0:1], v0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_is_private: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_load_dword s0, s[4:5], 0x44 +; GFX8V4-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V4-NEXT: flat_store_dword v[0:1], v0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_is_private: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dword s0, s[6:7], 0xc8 +; GFX8V5-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0 +; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX8V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8V5-NEXT: flat_store_dword v[0:1], v0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_is_private: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_load_dword s0, s[6:7], 0x4 
+; GFX9V3-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V3-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V3-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V3-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V3-NEXT: global_store_dword v[0:1], v0, off +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_is_private: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_load_dword s0, s[6:7], 0x4 +; GFX9V4-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V4-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V4-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V4-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V4-NEXT: global_store_dword v[0:1], v0, off +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_is_private: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_load_dword s0, s[6:7], 0x4 +; GFX9V5-NEXT: s_getreg_b32 s1, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX9V5-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: s_cmp_eq_u32 s0, s1 +; GFX9V5-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9V5-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9V5-NEXT: global_store_dword v[0:1], v0, off +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %is.private = call i1 @llvm.amdgcn.is.private(i8* %ptr) + %zext = zext i1 %is.private to i32 + store volatile i32 %zext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @llvm_trap() { +; GFX8V3-LABEL: llvm_trap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V3-NEXT: s_trap 2 +; +; GFX8V4-LABEL: llvm_trap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX8V4-NEXT: s_trap 2 +; +; GFX8V5-LABEL: llvm_trap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0xc8 +; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX8V5-NEXT: s_trap 2 +; +; GFX9V3-LABEL: 
llvm_trap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_mov_b64 s[0:1], s[4:5] +; GFX9V3-NEXT: s_trap 2 +; +; GFX9V4-LABEL: llvm_trap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 2 +; +; GFX9V5-LABEL: llvm_trap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 2 + call void @llvm.trap() + unreachable +} + +define amdgpu_kernel void @llvm_debugtrap() { +; GFX8V3-LABEL: llvm_debugtrap: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: s_trap 3 +; +; GFX8V4-LABEL: llvm_debugtrap: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: s_trap 3 +; +; GFX8V5-LABEL: llvm_debugtrap: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: s_trap 3 +; +; GFX9V3-LABEL: llvm_debugtrap: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: s_trap 3 +; +; GFX9V4-LABEL: llvm_debugtrap: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: s_trap 3 +; +; GFX9V5-LABEL: llvm_debugtrap: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: s_trap 3 + call void @llvm.debugtrap() + unreachable +} + +define amdgpu_kernel void @llvm_amdgcn_queue_ptr(i64 addrspace(1)* %ptr) { +; GFX8V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V3: ; %bb.0: +; GFX8V3-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V3-NEXT: s_add_u32 s0, s8, 8 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V3-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s10 +; GFX8V3-NEXT: v_mov_b32_e32 v3, s11 +; GFX8V3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V3-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8V3-NEXT: s_waitcnt vmcnt(0) +; GFX8V3-NEXT: s_endpgm +; +; GFX8V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V4: ; %bb.0: +; GFX8V4-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V4-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX8V4-NEXT: s_add_u32 s0, s8, 8 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V4-NEXT: v_mov_b32_e32 v2, s10 +; GFX8V4-NEXT: v_mov_b32_e32 v3, s11 +; GFX8V4-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V4-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8V4-NEXT: s_waitcnt vmcnt(0) +; GFX8V4-NEXT: s_endpgm +; +; GFX8V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX8V5: ; %bb.0: +; GFX8V5-NEXT: v_mov_b32_e32 v0, s6 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s7 +; GFX8V5-NEXT: s_add_u32 s0, s8, 8 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_addc_u32 s1, s9, 0 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 +; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc +; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX8V5-NEXT: v_mov_b32_e32 v2, s10 +; GFX8V5-NEXT: v_mov_b32_e32 v3, s11 +; GFX8V5-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s1 +; GFX8V5-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8V5-NEXT: s_waitcnt vmcnt(0) +; GFX8V5-NEXT: s_endpgm +; +; GFX9V3-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V3: ; %bb.0: +; GFX9V3-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; 
GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V3-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V3-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V3-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V3-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V3-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V3-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V3-NEXT: s_waitcnt vmcnt(0) +; GFX9V3-NEXT: s_endpgm +; +; GFX9V4-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V4: ; %bb.0: +; GFX9V4-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V4-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V4-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V4-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V4-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V4-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V4-NEXT: s_waitcnt vmcnt(0) +; GFX9V4-NEXT: s_endpgm +; +; GFX9V5-LABEL: llvm_amdgcn_queue_ptr: +; GFX9V5: ; %bb.0: +; GFX9V5-NEXT: v_mov_b32_e32 v2, 0 +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[6:7] glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[8:9] offset:8 glc +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: global_load_ubyte v0, v2, s[4:5] glc +; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: v_mov_b32_e32 v0, s10 +; GFX9V5-NEXT: v_mov_b32_e32 v1, s11 +; GFX9V5-NEXT: ; kill: killed $sgpr6_sgpr7 +; GFX9V5-NEXT: ; kill: killed $sgpr4_sgpr5 +; GFX9V5-NEXT: s_waitcnt lgkmcnt(0) +; GFX9V5-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9V5-NEXT: s_waitcnt vmcnt(0) +; GFX9V5-NEXT: s_endpgm + %queue.ptr = call i8 
addrspace(4)* @llvm.amdgcn.queue.ptr() + %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() + %dispatch.id = call i64 @llvm.amdgcn.dispatch.id() + %queue.load = load volatile i8, i8 addrspace(4)* %queue.ptr + %implicitarg.load = load volatile i8, i8 addrspace(4)* %implicitarg.ptr + %dispatch.load = load volatile i8, i8 addrspace(4)* %dispatch.ptr + store volatile i64 %dispatch.id, i64 addrspace(1)* %ptr + ret void +} + +declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() +declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() +declare i64 @llvm.amdgcn.dispatch.id() +declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +declare i1 @llvm.amdgcn.is.shared(i8*) +declare i1 @llvm.amdgcn.is.private(i8*) +declare void @llvm.trap() +declare void @llvm.debugtrap()