Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -254,6 +254,9 @@
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  bool isMemOpUniform(const SDNode *N) const;
+  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
 };
 
 namespace AMDGPUISD {
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2480,9 +2480,9 @@
 
   unsigned Size = VT.getStoreSize();
   unsigned Align = LN->getAlignment();
+  unsigned AS = LN->getAddressSpace();
   if (Align < Size && isTypeLegal(VT)) {
     bool IsFast;
-    unsigned AS = LN->getAddressSpace();
 
     // Expand unaligned loads earlier than legalization. Due to visitation order
     // problems during legalization, the emitted instructions to pack and unpack
@@ -2500,6 +2500,77 @@
     return SDValue();
   }
 
+  // Create DWORDX3 loads. We cannot create them later because the legalizer
+  // will split them and there is no way to specify custom lowering.
+  bool IsGlobal = (AS == AMDGPUASI.GLOBAL_ADDRESS);
+  bool IsConstant = (AS == AMDGPUASI.CONSTANT_ADDRESS);
+  bool IsGlobalOrConstant = IsGlobal || IsConstant;
+  // TODO: support vec3 stores and move the logic of this condition into
+  //       shouldCombineMemoryType().
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
+      VT.isExtended() && VT.isVector() && VT.getVectorNumElements() == 3 &&
+      // There are no sub-dword vector loads.
+      VT.getVectorElementType().getStoreSize() == 4 &&
+      // There are no vector extloads.
+      LN->getExtensionType() == ISD::LoadExtType::NON_EXTLOAD &&
+      ((Subtarget->useFlatForGlobal() && IsGlobalOrConstant) ||
+       AS == AMDGPUASI.FLAT_ADDRESS) &&
+      // Uniform constant loads will be selected to scalar loads, which do not
+      // have a DWORDX3 form.
+      !((IsConstant || (IsGlobal && Subtarget->getScalarizeGlobalBehavior() &&
+                        isMemOpHasNoClobberedMemOperand(LN))) &&
+        isMemOpUniform(LN))) {
+    SDValue ZeroFlag = DAG.getTargetConstant(0, SL, MVT::i1); // GLC/SLC
+    SDValue Ptr = LN->getBasePtr();
+    SDValue Offset = LN->getOffset();
+
+    int64_t OffVal = 0;
+    if (auto OffC = dyn_cast<ConstantSDNode>(Offset))
+      OffVal = OffC->getSExtValue();
+    // GFX9: Imm offset: Scratch, Global: 13-bit signed byte offset
+    //       FLAT: 12-bit unsigned offset (MSB is ignored)
+    // TODO: It does not seem to be possible to get any offset after
+    //       SelectionDAGBuilder.
+    if ((OffVal && (!Subtarget->hasFlatInstOffsets() ||
+                    (IsGlobalOrConstant && !isInt<13>(OffVal)) ||
+                    !isUInt<12>(OffVal))) ||
+        // Is it possible to get a non-constant offset recorded in a LoadSDNode?
+        (!OffVal && !Offset.isUndef())) {
+      Ptr = DAG.getNode(ISD::ADD, SL, Ptr.getValueType(), Ptr, Offset);
+      OffVal = 0;
+    }
+    Offset = DAG.getTargetConstant(OffVal, SL, MVT::i16);
+
+    // TODO: introduce AMDGPUISD::LOAD3 returning v4i32 and select it later
+    //       to allow proper non-constant offset folding with GFX9 flat/global
+    //       instructions and with buffer_load_dwordx3.
+    //       That is, in case we are interested in supporting MUBUF or
+    //       VGPR offsets with an SGPR base on GFX9. Both are unclear.
+    //       However, SelectionDAGBuilder does not really record an offset
+    //       even if constant, so we still want to get that constant offset
+    //       and we do not want to replicate SelectADDR/MUBUFOffset code here.
+    unsigned Opc = AMDGPU::FLAT_LOAD_DWORDX3;
+
+    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9 &&
+        IsGlobalOrConstant)
+      Opc = AMDGPU::GLOBAL_LOAD_DWORDX3;
+
+    // We must return a legal v4 type because the DAG legalizer cannot widen
+    // machine node results, but knows how to widen BUILD_VECTOR.
+    EVT V4VT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+    auto NewLoad = DAG.getMachineNode(Opc, SL, V4VT, N->getValueType(1),
+                                      { Ptr, Offset, ZeroFlag, ZeroFlag });
+
+    auto MMOs = DAG.getMachineFunction().allocateMemRefsArray(1);
+    *MMOs = LN->getMemOperand();
+    NewLoad->setMemRefs(MMOs, MMOs + 1);
+
+    SmallVector<SDValue, 4> Elts;
+    DAG.ExtractVectorElements(SDValue(NewLoad, 0), Elts, 0, 3);
+    SDValue V3 = DAG.getBuildVector(VT, SL, { Elts[0], Elts[1], Elts[2] });
+    return DAG.getMergeValues({ V3, SDValue(NewLoad, 1) }, SL);
+  }
+
   if (!shouldCombineMemoryType(VT))
     return SDValue();
 
@@ -3793,3 +3864,17 @@
     return 1;
   }
 }
+
+bool AMDGPUTargetLowering::isMemOpUniform(const SDNode *N) const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+
+  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
+}
+
+bool AMDGPUTargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N)
+    const {
+  const MemSDNode *MemNode = cast<MemSDNode>(N);
+  const Value *Ptr = MemNode->getMemOperand()->getValue();
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.noclobber");
+}
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -166,8 +166,6 @@
                           bool MemcpyStrSrc,
                           MachineFunction &MF) const override;
 
-  bool isMemOpUniform(const SDNode *N) const;
-  bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
   bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -819,13 +819,6 @@
          isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
 }
 
-bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
-  const MemSDNode *MemNode = cast<MemSDNode>(N);
-  const Value *Ptr = MemNode->getMemOperand()->getValue();
-  const Instruction *I = dyn_cast<Instruction>(Ptr);
-  return I && I->getMetadata("amdgpu.noclobber");
-}
-
 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
@@ -836,12 +829,6 @@
   return isNoopAddrSpaceCast(SrcAS, DestAS);
 }
 
-bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
-  const MemSDNode *MemNode = cast<MemSDNode>(N);
-
-  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
-}
-
 TargetLoweringBase::LegalizeTypeAction
 SITargetLowering::getPreferredVectorAction(EVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
Index: test/CodeGen/AMDGPU/load-global-f32.ll
===================================================================
--- test/CodeGen/AMDGPU/load-global-f32.ll
+++ test/CodeGen/AMDGPU/load-global-f32.ll
@@ -31,7 +31,7 @@
 
 ; FUNC-LABEL: {{^}}global_load_v3f32:
 ; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx3
 
 ; R600: VTX_READ_128
 define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/load-global-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/load-global-i32.ll
+++ test/CodeGen/AMDGPU/load-global-i32.ll
@@ -30,7 +30,7 @@
 
 ; FUNC-LABEL: {{^}}global_load_v3i32:
 ; GCN-NOHSA: buffer_load_dwordx4
-; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx3
 
 ; EG: VTX_READ_128
 define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/load-vec3.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/load-vec3.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-MUBUF %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}load_global_v3i32:
+; VI: flat_load_dwordx3
+; GFX9: global_load_dwordx3
+; GCN-MUBUF-DAG: buffer_load_dwordx2 v
+; GCN-MUBUF-DAG: buffer_load_dword v
+define amdgpu_kernel void @load_global_v3i32(float addrspace(1)* nocapture readonly %in, <3 x float> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds float, float addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast float addrspace(1)* %gep_in to <3 x i32> addrspace(1)*
+  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep_in_v3, align 4
+  %gep_out = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %out, i32 %id
+  %vec3i = bitcast <3 x i32> %load to <3 x float>
+  store <3 x float> %vec3i, <3 x float> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3f32:
+; VI: flat_load_dwordx3
+; GFX9: global_load_dwordx3
+; GCN-MUBUF-DAG: buffer_load_dwordx2 v
+; GCN-MUBUF-DAG: buffer_load_dword v
+define amdgpu_kernel void @load_global_v3f32(float addrspace(1)* nocapture readonly %in, <3 x float> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds float, float addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast float addrspace(1)* %gep_in to <3 x float> addrspace(1)*
+  %load = load <3 x float>, <3 x float> addrspace(1)* %gep_in_v3, align 4
+  %val = fadd <3 x float> %load, %load
+  %gep_out = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %out, i32 %id
+  store <3 x float> %val, <3 x float> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_constant_v3i32:
+; VI: flat_load_dwordx3
+; GFX9: global_load_dwordx3
+; GCN-MUBUF-DAG: buffer_load_dwordx2 v
+; GCN-MUBUF-DAG: buffer_load_dword v
+define amdgpu_kernel void @load_constant_v3i32(i32 addrspace(2)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %id
+  %gep_in_v3 = bitcast i32 addrspace(2)* %gep_in to <3 x i32> addrspace(2)*
+  %load = load <3 x i32>, <3 x i32> addrspace(2)* %gep_in_v3, align 4
+  %gep_out = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %id
+  store <3 x i32> %load, <3 x i32> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_v3i32:
+; GCN: flat_load_dwordx3
+define amdgpu_kernel void @load_flat_v3i32(i32 addrspace(4)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %id
+  %gep_in_v3 = bitcast i32 addrspace(4)* %gep_in to <3 x i32> addrspace(4)*
+  %load = load <3 x i32>, <3 x i32> addrspace(4)* %gep_in_v3, align 4
+  %gep_out = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %id
+  store <3 x i32> %load, <3 x i32> addrspace(1)* %gep_out, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3f16:
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN-NOT: load_dwordx3
+define amdgpu_kernel void @load_global_v3f16(half addrspace(1)* nocapture readonly %in, <3 x half> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds half, half addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast half addrspace(1)* %gep_in to <3 x half> addrspace(1)*
+  %load = load <3 x half>, <3 x half> addrspace(1)* %gep_in_v3, align 2
+  %val = fadd <3 x half> %load, %load
+  %gep_out = getelementptr inbounds <3 x half>, <3 x half> addrspace(1)* %out, i32 %id
+  store <3 x half> %val, <3 x half> addrspace(1)* %gep_out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3i16_to_v3i32:
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN: {{buffer|flat|global}}_load_ushort v
+; GCN-NOT: load_dwordx3
+define amdgpu_kernel void @load_global_v3i16_to_v3i32(i16 addrspace(1)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep_in = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %id
+  %gep_in_v3 = bitcast i16 addrspace(1)* %gep_in to <3 x i16> addrspace(1)*
+  %load = load <3 x i16>, <3 x i16> addrspace(1)* %gep_in_v3, align 2
+  %val = zext <3 x i16> %load to <3 x i32>
+  %gep_out = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %id
+  store <3 x i32> %val, <3 x i32> addrspace(1)* %gep_out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_global_v3i32_scalar:
+; GCN-DAG: s_load_dwordx2 s[{{[0-9:]+}}], s[{{[0-9:]+}}], 0x0
+; GCN-DAG: s_load_dword s{{[0-9]+}}, s[{{[0-9:]+}}], 0x{{2|8}}
+; GCN-NOT: load_dwordx3
+define amdgpu_kernel void @load_global_v3i32_scalar(float addrspace(1)* nocapture readonly %in, <3 x i32> addrspace(1)* nocapture %out) {
+  %gep_in = getelementptr inbounds float, float addrspace(1)* %in, i32 0
+  %gep_in_v3 = bitcast float addrspace(1)* %gep_in to <3 x i32> addrspace(1)*
+  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep_in_v3, align 4
+  store <3 x i32> %load, <3 x i32> addrspace(1)* %out, align 16
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #1 = { nounwind readnone speculatable }