Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -108,6 +108,11 @@
         "true",
         "Force using DS instruction immediate offsets on SI">;
 
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+        "FlatForGlobal",
+        "true",
+        "Force the use of flat instructions for global memory access">;
+
 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
         "FlatAddressSpace",
         "true",
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -95,7 +95,7 @@
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;
-  void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
+  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
                    SDValue &TFE) const;
@@ -920,12 +920,16 @@
   return isUInt<12>(Imm->getZExtValue());
 }
 
-void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &VAddr, SDValue &SOffset,
                                      SDValue &Offset, SDValue &Offen,
                                      SDValue &Idxen, SDValue &Addr64,
                                      SDValue &GLC, SDValue &SLC,
                                      SDValue &TFE) const {
+  // Subtarget prefers flat instructions for global access; don't select MUBUF.
+  if (Subtarget->useFlatForGlobal())
+    return false;
+
   SDLoc DL(Addr);
 
   GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -958,14 +962,14 @@
 
     if (isLegalMUBUFImmOffset(C1)) {
       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-      return;
+      return true;
     } else if (isUInt<32>(C1->getZExtValue())) {
       // Illegal offset, store it in soffset.
       Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
       SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
                         0);
-      return;
+      return true;
     }
   }
 
@@ -977,13 +981,15 @@
     Ptr = N0;
     VAddr = N1;
     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-    return;
+    return true;
   }
 
   // default case -> offset
   VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
   Ptr = Addr;
   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+
+  return true;
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
@@ -996,8 +1002,9 @@
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
     return false;
 
-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;
 
   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
   if (C->getSExtValue()) {
@@ -1063,8 +1070,9 @@
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
-  SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
-              GLC, SLC, TFE);
+  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
+                   GLC, SLC, TFE))
+    return false;
 
   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -70,6 +70,7 @@
   bool FastFMAF32;
   bool CaymanISA;
   bool FlatAddressSpace;
+  bool FlatForGlobal;
   bool EnableIRStructurizer;
   bool EnablePromoteAlloca;
   bool EnableIfCvt;
@@ -159,6 +160,10 @@
     return FlatAddressSpace;
   }
 
+  bool useFlatForGlobal() const {
+    return FlatForGlobal;
+  }
+
   bool hasBFE() const {
     return (getGeneration() >= EVERGREEN);
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -68,9 +68,9 @@
       DumpCode(false), R600ALUInst(false), HasVertexCache(false),
       TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false),
       FP64Denormals(false), FP32Denormals(false), FastFMAF32(false),
-      CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true),
-      EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false),
-      EnableUnsafeDSOffsetFolding(false),
+      CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
+      EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
+      EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
       WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
       EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
       GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
@@ -99,6 +99,9 @@
                                           MaxStackAlign, 0));
   }
 
+  // Default to flat for global when targeting HSA on Sea Islands.
+  if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS && isAmdHsaOS())
+    FlatForGlobal = true;
 }
 
 unsigned AMDGPUSubtarget::getStackEntrySize() const {
Index: lib/Target/AMDGPU/CIInstructions.td
===================================================================
--- lib/Target/AMDGPU/CIInstructions.td
+++ lib/Target/AMDGPU/CIInstructions.td
@@ -33,6 +33,8 @@
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
 
+
+
 //===----------------------------------------------------------------------===//
 // VOP1 Instructions
 //===----------------------------------------------------------------------===//
 
@@ -234,3 +236,64 @@
 >;
 
 } // End Predicates = [isCI]
+
+
+//===----------------------------------------------------------------------===//
+// Patterns to generate flat instructions for global accesses
+//===----------------------------------------------------------------------===//
+
+def useFlatForGlobal : Predicate <
+  "Subtarget->useFlatForGlobal() || "
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
+
+let Predicates = [useFlatForGlobal] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+// Patterns for global loads with no offset
+class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+  (vt (node i64:$addr)),
+  (inst $addr, 0, 0, 0)
+>;
+
+def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
+
+class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+  (node vt:$data, i64:$addr),
+  (inst $data, $addr, 0, 0, 0)
+>;
+
+def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
+
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
+  (vt (node i64:$addr, vt:$data)),
+  (inst $addr, $data, 0, 0)
+>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+
+
+} // End Predicates = [useFlatForGlobal]
Index: lib/Target/AMDGPU/VIInstructions.td
===================================================================
--- lib/Target/AMDGPU/VIInstructions.td
+++ lib/Target/AMDGPU/VIInstructions.td
@@ -101,58 +101,3 @@
 
 } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI
 
-//===----------------------------------------------------------------------===//
-// SMEM Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [isVI] in {
-
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
-  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
-
-// Patterns for global loads with no offset
-class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr)),
-  (inst $addr, 0, 0, 0)
->;
-
-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
-
-class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (node vt:$data, i64:$addr),
-  (inst $data, $addr, 0, 0, 0)
->;
-
-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
-
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr, vt:$data)),
-  (inst $addr, $data, 0, 0)
->;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-
-
-} // End Predicates = [isVI]
Index: test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=CI-HSA %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=CI-NOHSA %s
+
+; CI-NOHSA: buffer_store_dword
+; CI-HSA: flat_store_dword
+define spir_kernel void @test_store(i32 %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
+bb:
+  %tmp = tail call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #2
+  %tmp7 = getelementptr i8, i8 addrspace(2)* %tmp, i64 4
+  %tmp8 = bitcast i8 addrspace(2)* %tmp7 to i32 addrspace(2)*
+  %tmp9 = load i32, i32 addrspace(2)* %tmp8, align 4
+  %tmp10 = and i32 %tmp9, 65535
+  %tmp11 = tail call i32 @llvm.r600.read.tgid.x() #2
+  %tmp12 = tail call i32 @llvm.r600.read.tidig.x() #2
+  %tmp13 = mul i32 %tmp11, %tmp10
+  %tmp14 = add i32 %tmp12, %tmp13
+  %tmp15 = lshr i32 %tmp9, 16
+  %tmp16 = tail call i32 @llvm.r600.read.tgid.y() #2
+  %tmp17 = tail call i32 @llvm.r600.read.tidig.y() #2
+  %tmp18 = mul i32 %tmp16, %tmp15
+  %tmp19 = add i32 %tmp17, %tmp18
+  %tmp20 = icmp eq i32 %arg, 0
+  %tmp21 = mul nsw i32 %tmp19, %arg2
+  %tmp22 = add nsw i32 %tmp14, %tmp21
+  %tmp23 = sext i32 %tmp22 to i64
+  %tmp24 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp23
+  br i1 %tmp20, label %bb26, label %bb25
+
+bb25:                                             ; preds = %bb
+  store i32 3, i32 addrspace(1)* %tmp24, align 4
+  br label %bb26
+
+bb26:                                             ; preds = %bb25, %bb
+  store i32 1, i32 addrspace(1)* %tmp24, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
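
Note: because the feature string defined in AMDGPU.td is "flat-for-global", the
new behavior can also be forced through llc's generic -mattr mechanism,
independent of the HSA OS check. A minimal sketch of such a check, assuming a
CI target with flat support (bonaire); the function and GCN-FLAT prefix are
illustrative only and not part of the patch:

; RUN: llc < %s -mtriple=amdgcn -mcpu=bonaire -mattr=+flat-for-global | FileCheck -check-prefix=GCN-FLAT %s
; GCN-FLAT: flat_store_dword
define void @store_global_flat(i32 addrspace(1)* %out) {
  store i32 0, i32 addrspace(1)* %out
  ret void
}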