Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -241,12 +241,6 @@ "Force using DS instruction immediate offsets on SI" >; -def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", "EnableIfCvt", "false", "Disable the if conversion pass" ->; - def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", "EnableSIScheduler", "true", Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -124,6 +124,10 @@ if (!TM || skipFunction(F)) return false; + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + if (!ST.isPromoteAllocaEnabled()) + return false; + FunctionType *FTy = F.getFunctionType(); // If the function has any arguments in the local address space, then it's @@ -139,8 +143,6 @@ } } - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); - LocalMemLimit = ST.getLocalMemorySize(); if (LocalMemLimit == 0) return false; Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -82,7 +82,6 @@ // Used as options.
bool EnableVGPRSpilling; bool EnablePromoteAlloca; - bool EnableIfCvt; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; @@ -222,10 +221,6 @@ return EnablePromoteAlloca; } - bool isIfCvtEnabled() const { - return EnableIfCvt; - } - bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -105,7 +105,6 @@ EnableVGPRSpilling(false), EnablePromoteAlloca(false), - EnableIfCvt(true), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -45,6 +45,18 @@ cl::desc("Use StructurizeCFG IR pass"), cl::init(true)); +static cl::opt<bool> EnableSROA( + "amdgpu-sroa", + cl::desc("Run SROA after promote alloca pass"), + cl::ReallyHidden, + cl::init(true)); + +static cl::opt<bool> EnableR600IfConvert( + "r600-if-convert", + cl::desc("Use if conversion pass"), + cl::ReallyHidden, + cl::init(true)); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); @@ -212,12 +224,7 @@ } ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const override { - const SISubtarget *ST = getGCNTargetMachine().getSubtargetImpl(); - if (ST->enableSIScheduler()) - return createSIMachineScheduler(C); - return nullptr; - } + createMachineScheduler(MachineSchedContext *C) const override; bool addPreISel() override; void addMachineSSAOptimization() override; @@ -285,10 +292,11 @@ addPass(createAMDGPUOpenCLImageTypeLoweringPass()); const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); - const AMDGPUSubtarget &ST = *TM.getSubtargetImpl(); - if
(TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) { + if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createAMDGPUPromoteAlloca(&TM)); - addPass(createSROAPass()); + + if (EnableSROA) + addPass(createSROAPass()); } addStraightLineScalarOptimizationPasses(); @@ -344,9 +352,8 @@ } void R600PassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createR600EmitClauseMarkers(), false); - if (ST.isIfCvtEnabled()) + if (EnableR600IfConvert) addPass(&IfConverterID, false); addPass(createR600ClauseMergePass(*TM), false); } @@ -367,6 +374,14 @@ // GCN Pass Setup //===----------------------------------------------------------------------===// +ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( + MachineSchedContext *C) const { + const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); + if (ST.enableSIScheduler()) + return createSIMachineScheduler(C); + return nullptr; +} + bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); @@ -415,8 +430,6 @@ #endif void GCNPassConfig::addPreRegAlloc() { - const SISubtarget &ST = *getGCNTargetMachine().getSubtargetImpl(); - // This needs to be run directly before register allocation because // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass @@ -424,15 +437,18 @@ insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + if (getOptLevel() > CodeGenOpt::None) { // Don't do this with no optimizations since it throws away debug info by // merging nonadjacent loads. // This should be run after scheduling, but before register allocation. It // also need extra copies to the address operand to be eliminated. + + // FIXME: Move pre-RA and remove extra reg coalescer run.
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } + addPass(createSIShrinkInstructionsPass()); addPass(createSIWholeQuadModePass()); } Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -412,6 +412,9 @@ return false; const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + if (!STM.loadStoreOptEnabled()) + return false; + TII = STM.getInstrInfo(); TRI = &TII->getRegisterInfo(); Index: test/CodeGen/AMDGPU/captured-frame-index.ll =================================================================== --- test/CodeGen/AMDGPU/captured-frame-index.ll +++ test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN
-check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; OPT-LABEL: @test_sink_global_small_offset_i32( ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in Index: test/CodeGen/AMDGPU/extload-private.ll =================================================================== --- test/CodeGen/AMDGPU/extload-private.ll +++ test/CodeGen/AMDGPU/extload-private.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: ; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen @@ -39,7 +39,7 @@ define void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 + %tmp1 = load volatile i16, i16* %tmp0 %tmp2 = zext i16 %tmp1 to i32 store i32 %tmp2, i32 addrspace(1)* %out ret void Index: test/CodeGen/AMDGPU/parallelandifcollapse.ll =================================================================== --- test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ 
test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -1,5 +1,4 @@ -; Function Attrs: nounwind -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s ; ; CFG flattening should use parallel-and mode to generate branch conditions and ; then merge if-regions with the same bodies. Index: test/CodeGen/AMDGPU/structurize1.ll =================================================================== --- test/CodeGen/AMDGPU/structurize1.ll +++ test/CodeGen/AMDGPU/structurize1.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -r600-if-convert=0 < %s | FileCheck %s ; This tests for abug where the AMDILCFGStructurizer was crashing on loops ; like this: