Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -108,6 +108,11 @@
         "true",
         "Force using DS instruction immediate offsets on SI">;
 
+def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
+        "FlatForGlobal",
+        "true",
+        "Force the use of flat instructions for global memory access">;
+
 def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
         "FlatAddressSpace",
         "true",
@@ -232,7 +237,7 @@
 def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
         [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
          FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace,
-         FeatureGCN1Encoding, FeatureCIInsts]>;
+         FeatureFlatForGlobal, FeatureGCN1Encoding, FeatureCIInsts]>;
 
 def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
         [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536,
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -414,7 +414,7 @@
     }
   }
 
-  if (VCCUsed)
+  if (VCCUsed || FlatUsed)
     MaxSGPR += 2;
 
   if (FlatUsed)
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -993,7 +993,9 @@
   SDValue Ptr, Offen, Idxen, Addr64;
 
   // addr64 bit was removed for volcanic islands.
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+  // Also bail out if the subtarget prefers flat instructions for global access.
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+      Subtarget->useFlatForGlobal())
     return false;
 
   SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
@@ -1059,6 +1061,11 @@
                                            SDValue &SOffset, SDValue &Offset,
                                            SDValue &GLC, SDValue &SLC,
                                            SDValue &TFE) const {
+
+  // Don't select MUBUF if the subtarget prefers flat for global access.
+  if (Subtarget->useFlatForGlobal())
+    return false;
+
   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -70,6 +70,7 @@
   bool FastFMAF32;
   bool CaymanISA;
   bool FlatAddressSpace;
+  bool FlatForGlobal;
   bool EnableIRStructurizer;
   bool EnablePromoteAlloca;
   bool EnableIfCvt;
@@ -159,6 +160,10 @@
     return FlatAddressSpace;
   }
 
+  bool useFlatForGlobal() const {
+    return FlatForGlobal;
+  }
+
   bool hasBFE() const {
     return (getGeneration() >= EVERGREEN);
   }
Index: lib/Target/AMDGPU/VIInstructions.td
===================================================================
--- lib/Target/AMDGPU/VIInstructions.td
+++ lib/Target/AMDGPU/VIInstructions.td
@@ -105,7 +105,11 @@
 // SMEM Patterns
 //===----------------------------------------------------------------------===//
 
-let Predicates = [isVI] in {
+def useFlatForGlobal : Predicate <
+  "Subtarget->useFlatForGlobal() || "
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
+
+let Predicates = [useFlatForGlobal] in {
 
 // 1. Offset as 20bit DWORD immediate
 def : Pat <
Index: test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
@@ -0,0 +1,52 @@
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck %s
+
+
+; CHECK-NOT: buffer_store_dword
+; CHECK: flat_store_dword
+define spir_kernel void @test_store(i32 %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
+bb:
+  %tmp = tail call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #2
+  %tmp7 = getelementptr i8, i8 addrspace(2)* %tmp, i64 4
+  %tmp8 = bitcast i8 addrspace(2)* %tmp7 to i32 addrspace(2)*
+  %tmp9 = load i32, i32 addrspace(2)* %tmp8, align 4
+  %tmp10 = and i32 %tmp9, 65535
+  %tmp11 = tail call i32 @llvm.r600.read.tgid.x() #2
+  %tmp12 = tail call i32 @llvm.r600.read.tidig.x() #2
+  %tmp13 = mul i32 %tmp11, %tmp10
+  %tmp14 = add i32 %tmp12, %tmp13
+  %tmp15 = lshr i32 %tmp9, 16
+  %tmp16 = tail call i32 @llvm.r600.read.tgid.y() #2
+  %tmp17 = tail call i32 @llvm.r600.read.tidig.y() #2
+  %tmp18 = mul i32 %tmp16, %tmp15
+  %tmp19 = add i32 %tmp17, %tmp18
+  %tmp20 = icmp eq i32 %arg, 0
+  %tmp21 = mul nsw i32 %tmp19, %arg2
+  %tmp22 = add nsw i32 %tmp14, %tmp21
+  %tmp23 = sext i32 %tmp22 to i64
+  %tmp24 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp23
+  br i1 %tmp20, label %bb26, label %bb25
+
+bb25:                                             ; preds = %bb
+  store i32 3, i32 addrspace(1)* %tmp24, align 4
+  br label %bb26
+
+bb26:                                             ; preds = %bb25, %bb
+  store i32 1, i32 addrspace(1)* %tmp24, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
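Note: because the feature is declared as a plain SubtargetFeature named "flat-for-global", it should also be controllable per-run via the standard -mattr=+flat-for-global / -mattr=-flat-for-global spellings, independent of the amdhsa triple used in the test above. A minimal sketch of such a test follows; it is hypothetical and not part of this patch, and the CPU choice (bonaire, a Sea Islands target that gets the feature by default with this change) and the @store_global function are illustrative only.

; RUN: llc < %s -march=amdgcn -mcpu=bonaire -mattr=+flat-for-global | FileCheck -check-prefix=FLAT %s
; RUN: llc < %s -march=amdgcn -mcpu=bonaire -mattr=-flat-for-global | FileCheck -check-prefix=MUBUF %s

; With the feature enabled, a simple global store should select the flat
; form; with it disabled, CI should fall back to the MUBUF addr64 form.
; FLAT: flat_store_dword
; MUBUF: buffer_store_dword
define void @store_global(i32 addrspace(1)* %out) {
  store i32 7, i32 addrspace(1)* %out
  ret void
}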