Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -508,6 +508,7 @@
       KernelInfo.ComputePGMRSrc1 |
       (KernelInfo.ComputePGMRSrc2 << 32);
   header.code_properties =
+      AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR |
       AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
       AMD_CODE_PROPERTY_IS_PTR64;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -22,12 +22,15 @@
 class SITargetLowering : public AMDGPUTargetLowering {
   SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
-                         SDValue Chain, unsigned Offset, bool Signed) const;
+                         SDValue Chain, unsigned Offset,
+                         unsigned BasePtrReg, bool Signed) const;
   SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
                                SelectionDAG &DAG) const;
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const override;
+  SDValue LowerLocalSizeIntrinsic(SelectionDAG &DAG, SDLoc DL,
+                                  unsigned Dim) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
 #endif
 
 #include "SIISelLowering.h"
+#include "SIInstrInfo.h"
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
@@ -466,12 +467,15 @@
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          SDLoc SL, SDValue Chain,
-                                         unsigned Offset, bool Signed) const {
+                                         unsigned Offset, unsigned BasePtrReg,
+                                         bool Signed) const {
   const DataLayout &DL = DAG.getDataLayout();
   MachineFunction &MF = DAG.getMachineFunction();
   const SIRegisterInfo *TRI =
       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
-  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+
+  if (!BasePtrReg)
+    BasePtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
 
   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
 
@@ -479,7 +483,7 @@
   MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
-                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
+                                       MRI.getLiveInVirtReg(BasePtrReg), PtrVT);
   SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                             DAG.getConstant(Offset, SL, PtrVT));
   SDValue PtrOffset = DAG.getUNDEF(PtrVT);
@@ -572,7 +576,7 @@
   // The pointer to the scratch buffer is stored in SGPR2, SGPR3
   if (Info->getShaderType() == ShaderType::COMPUTE) {
     if (Subtarget->isAmdHsaOS())
-      Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
+      Info->NumUserSGPRs = 4; // FIXME: Need to support scratch buffers.
     else
       Info->NumUserSGPRs = 4;
@@ -596,6 +600,17 @@
     CCInfo.AllocateReg(ScratchPtrRegHi);
     MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
     MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass);
+    if (Subtarget->isAmdHsaOS()) {
+      unsigned DispatchPtrReg =
+          TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
+      unsigned DispatchPtrRegLo =
+          TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 0);
+      unsigned DispatchPtrRegHi =
+          TRI->getPhysRegSubReg(DispatchPtrReg, &AMDGPU::SReg_32RegClass, 1);
+      CCInfo.AllocateReg(DispatchPtrRegLo);
+      CCInfo.AllocateReg(DispatchPtrRegHi);
+      MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
+    }
   }
 
   if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -625,8 +640,10 @@
                                VA.getLocMemOffset();
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.
+      unsigned InputPtrReg = TRI->getPreloadedValue(MF,
+                                                    SIRegisterInfo::INPUT_PTR);
       SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
-                                   Offset, Ins[i].Flags.isSExt());
+                                   Offset, InputPtrReg, Ins[i].Flags.isSExt());
       Chains.push_back(Arg.getValue(1));
 
       auto *ParamTy =
@@ -989,6 +1006,60 @@
   // a glue result.
 }
 
+SDValue SITargetLowering::LowerLocalSizeIntrinsic(SelectionDAG &DAG,
+                                                  SDLoc DL,
+                                                  unsigned Dim) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+
+  unsigned Offset;
+  unsigned BasePtr;
+  EVT MemVT;
+  SDValue Param;
+  if (Subtarget->isAmdHsaOS()) {
+    BasePtr = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
+
+    // Local size values are 16 bits, but we always load 32-bit values and
+    // then mask or shift to get the correct value. This allows us to
+    // load the data with SMRD instructions, which is faster than using
+    // MUBUF instructions.
+    Offset = SI::DispatchPacketOffset::LOCAL_SIZE_X + (4 * (Dim >> 1));
+    Param = LowerParameter(DAG, MVT::i32, MVT::i32, DL, DAG.getEntryNode(),
+                           Offset, BasePtr, false);
+
+    switch (Dim) {
+    case 0:
+      // Clear the high bits.
+      Param = DAG.getNode(ISD::AND, DL, MVT::i32, Param,
+                          DAG.getConstant(0xffff, DL, MVT::i32));
+      break;
+    case 1:
+      // Get local size y from the high bits. We can use SRL here, because
+      // the max value range is 0-256, so the sign bit will always be zero.
+      Param = DAG.getNode(ISD::SRL, DL, MVT::i32, Param,
+                          DAG.getConstant(16, DL, MVT::i32));
+      break;
+    case 2:
+      // Do nothing; the 16 bits after the z dimension size are always
+      // zero, so we don't need to clear them. AssertZext will be added
+      // below.
+      break;
+    }
+  } else {
+
+    BasePtr = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
+    Offset = SI::KernelInputOffsets::LOCAL_SIZE_X + (Dim * 4);
+
+    Param = LowerParameter(DAG, MVT::i32, MVT::i32, DL, DAG.getEntryNode(),
+                           Offset, BasePtr, false);
+  }
+
+  // The local size values will have the high 16 bits as zero.
+  return DAG.getNode(ISD::AssertZext, DL, MVT::i32, Param,
+                     DAG.getValueType(MVT::i16));
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -999,41 +1070,46 @@
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
 
   // TODO: Should this propagate fast-math-flags?
   switch (IntrinsicID) {
   case Intrinsic::r600_read_ngroups_x:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_X, false);
+                          SI::KernelInputOffsets::NGROUPS_X,
+                          InputPtrReg, false);
   case Intrinsic::r600_read_ngroups_y:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_Y, false);
+                          SI::KernelInputOffsets::NGROUPS_Y,
+                          InputPtrReg, false);
   case Intrinsic::r600_read_ngroups_z:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_Z, false);
+                          SI::KernelInputOffsets::NGROUPS_Z,
+                          InputPtrReg, false);
   case Intrinsic::r600_read_global_size_x:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+                          SI::KernelInputOffsets::GLOBAL_SIZE_X,
+                          InputPtrReg, false);
   case Intrinsic::r600_read_global_size_y:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+                          SI::KernelInputOffsets::GLOBAL_SIZE_Y,
+                          InputPtrReg, false);
   case Intrinsic::r600_read_global_size_z:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+                          SI::KernelInputOffsets::GLOBAL_SIZE_Z,
+                          InputPtrReg, false);
   case Intrinsic::r600_read_local_size_x:
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::LOCAL_SIZE_X, false);
+    return LowerLocalSizeIntrinsic(DAG, DL, 0);
   case Intrinsic::r600_read_local_size_y:
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::LOCAL_SIZE_Y, false);
+    return LowerLocalSizeIntrinsic(DAG, DL, 1);
   case Intrinsic::r600_read_local_size_z:
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::LOCAL_SIZE_Z, false);
+    return LowerLocalSizeIntrinsic(DAG, DL, 2);
   case Intrinsic::AMDGPU_read_workdim:
     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          getImplicitParameterOffset(MFI, GRID_DIM), false);
+                          getImplicitParameterOffset(MFI, GRID_DIM),
+                          InputPtrReg, false);
   case Intrinsic::r600_read_tgid_x:
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -387,6 +387,17 @@
 } // End namespace AMDGPU
 
 namespace SI {
+
+namespace DispatchPacketOffset {
+
+enum {
+  LOCAL_SIZE_X = 4,
+  LOCAL_SIZE_Y = 6,
+  LOCAL_SIZE_Z = 8,
+};
+
+}
+
 namespace KernelInputOffsets {
 
 /// Offsets in bytes from the start of the input buffer
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -92,6 +92,7 @@
   enum PreloadedValue {
     // SGPRS:
     SCRATCH_PTR = 0,
+    DISPATCH_PTR = 1,
     INPUT_PTR = 3,
     TGID_X = 10,
     TGID_Y = 11,
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -495,6 +495,7 @@
 
 unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                            enum PreloadedValue Value) const {
+  const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Value) {
   case SIRegisterInfo::TGID_X:
@@ -510,6 +511,8 @@
   case SIRegisterInfo::SCRATCH_PTR:
     return AMDGPU::SGPR2_SGPR3;
   case SIRegisterInfo::INPUT_PTR:
+    return STI.isAmdHsaOS() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1;
+  case SIRegisterInfo::DISPATCH_PTR:
     return AMDGPU::SGPR0_SGPR1;
   case SIRegisterInfo::TIDIG_X:
     return AMDGPU::VGPR0;
Index: test/CodeGen/AMDGPU/hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/hsa.ll
+++ test/CodeGen/AMDGPU/hsa.ll
@@ -39,7 +39,7 @@
 ; HSA: {{^}}simple:
 ; HSA: .amd_kernel_code_t
 ; HSA: .end_amd_kernel_code_t
-; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0
+; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[2:3], 0x0
 
 ; Make sure we are setting the ATC bit:
 ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
Index: test/CodeGen/AMDGPU/llvm.dbg.value.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: {{^}}test_debug_value:
 ; CHECK: s_load_dwordx2
-; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- SGPR0_SGPR1
+; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- SGPR2_SGPR3
 ; CHECK: buffer_store_dword
 ; CHECK: s_endpgm
 define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 {
Index: test/CodeGen/AMDGPU/work-item-intrinsics.ll
===================================================================
--- test/CodeGen/AMDGPU/work-item-intrinsics.ll
+++ test/CodeGen/AMDGPU/work-item-intrinsics.ll
@@ -1,5 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
@@ -7,9 +9,9 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].X
 
-; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
@@ -21,10 +23,10 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].Y
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
@@ -36,10 +38,10 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].Z
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
@@ -51,10 +53,10 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[0].W
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
@@ -66,10 +68,10 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[1].X
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
@@ -81,10 +83,10 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[1].Y
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
@@ -96,8 +98,11 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[1].Z
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4
+; HSA: s_and_b32 [[VAL:s[0-9]+]], [[XY]], 0xffff
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_x (i32 addrspace(1)* %out) {
@@ -111,8 +116,11 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[1].W
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
+; CI-HSA: s_load_dword [[XY_VAL:s[0-9]+]], s[0:1], 0x1
+; VI-HSA: s_load_dword [[XY_VAL:s[0-9]+]], s[0:1], 0x4
+; HSA: s_lshr_b32 [[VAL:s[0-9]+]], [[XY_VAL]], 16
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_y (i32 addrspace(1)* %out) {
@@ -126,8 +134,10 @@
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[2].X
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
+; CI-HSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
+; VI-HSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
 define void @local_size_z (i32 addrspace(1)* %out) {
@@ -137,14 +147,107 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}local_size_xy:
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4
+; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
+; HSA-DAG: s_lshr_b32 [[Y:s[0-9]+]], [[XY]], 16
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xy (i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %val = mul i32 %x, %y
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xz:
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1
+; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x2
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4
+; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xz (i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %val = mul i32 %x, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_yz:
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1
+; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x2
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4
+; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; HSA-DAG: s_lshr_b32 [[Y:s[0-9]+]], [[XY]], 16
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_yz (i32 addrspace(1)* %out) {
+entry:
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %val = mul i32 %y, %z
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}local_size_xyz:
+; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
+; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
+; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
+; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
+; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
+; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x1
+; CI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x2
+; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[0:1], 0x4
+; VI-HSA: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
+; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
+; HSA-DAG: s_lshr_b32 [[Y:s[0-9]+]], [[XY]], 16
+; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
+; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
+; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
+; GCN: buffer_store_dword [[VAL]]
+define void @local_size_xyz (i32 addrspace(1)* %out) {
+entry:
+  %x = call i32 @llvm.r600.read.local.size.x() #0
+  %y = call i32 @llvm.r600.read.local.size.y() #0
+  %z = call i32 @llvm.r600.read.local.size.z() #0
+  %xy = mul i32 %x, %y
+  %xyz = add i32 %xy, %z
+  store i32 %xyz, i32 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}get_work_dim:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV [[VAL]], KC0[2].Z
 
-; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
-; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[VVAL]]
+; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
+; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @get_work_dim (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.AMDGPU.read.workdim() #0
@@ -157,8 +260,8 @@
 ; kernel arguments, but this may change in the future.
 
 ; FUNC-LABEL: {{^}}tgid_x:
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
-; GCN: buffer_store_dword [[VVAL]]
+; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
+; GCN-NOHSA: buffer_store_dword [[VVAL]]
 define void @tgid_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
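
Note (not part of the patch): the HSA path added above assumes the 16-bit workgroup x and y sizes share one 32-bit dword of the dispatch packet and z starts the next dword, per the new SI::DispatchPacketOffset enum (LOCAL_SIZE_X = 4, LOCAL_SIZE_Y = 6, LOCAL_SIZE_Z = 8). That is why the HSA CHECK lines expect a single s_load_dword followed by s_and_b32 (x) or s_lshr_b32 (y). The standalone C++ sketch below only mirrors that unpacking for readers; the struct and function names are hypothetical and do not exist in LLVM or the HSA runtime.

// Illustration only: same extraction the lowering emits, under the byte
// offsets assumed by SI::DispatchPacketOffset in this patch.
#include <cassert>
#include <cstdint>

struct DispatchPacketPrefix {          // hypothetical view of the packet start
  uint32_t header_and_setup;           // bytes 0-3, unused here
  uint32_t local_size_xy;              // bytes 4-7: x in bits 0-15, y in 16-31
  uint32_t local_size_z;               // bytes 8-11: z in bits 0-15, rest zero
};

// Dim 0 -> AND 0xffff, Dim 1 -> SRL 16, Dim 2 -> dword as loaded (its high
// 16 bits are already zero), matching the three cases in
// LowerLocalSizeIntrinsic.
inline uint32_t localSize(const DispatchPacketPrefix &P, unsigned Dim) {
  assert(Dim < 3 && "invalid dimension");
  switch (Dim) {
  case 0:
    return P.local_size_xy & 0xffffu;
  case 1:
    return P.local_size_xy >> 16;
  default:
    return P.local_size_z;
  }
}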