diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -730,14 +730,26 @@ auto Reg = Dep.getReg(); MachineBasicBlock::const_instr_iterator I(SrcI->getIterator()); MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end()); + unsigned Lat = 0; for (++I; I != E && I->isBundledWithPred(); ++I) { - if (!I->modifiesRegister(Reg, TRI)) - continue; - Dep.setLatency(InstrInfo.getInstrLatency(getInstrItineraryData(), *I)); - break; + if (I->modifiesRegister(Reg, TRI)) + Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); + else if (Lat) + --Lat; } + Dep.setLatency(Lat); } else if (DstI->isBundle()) { - Dep.setLatency(InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI)); + const SIRegisterInfo *TRI = getRegisterInfo(); + auto Reg = Dep.getReg(); + MachineBasicBlock::const_instr_iterator I(DstI->getIterator()); + MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end()); + unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI); + for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { + if (I->readsRegister(Reg, TRI)) + break; + --Lat; + } + Dep.setLatency(Lat); } } diff --git a/llvm/test/CodeGen/AMDGPU/bundle-latency.mir b/llvm/test/CodeGen/AMDGPU/bundle-latency.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bundle-latency.mir @@ -0,0 +1,44 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=post-RA-sched %s -o - | FileCheck -check-prefix=GCN %s + +# Check that we move consumer further from producer, even if one of them is in a bundle. + +--- +name: src_bundle_latency +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: src_bundle_latency + ; GCN: $vgpr0, $vgpr1 = BUNDLE undef $vgpr3_vgpr4, implicit $exec { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + ; GCN: } + ; GCN: $vgpr6 = V_ADD_F32_e32 killed $vgpr0, $vgpr0, implicit $exec + ; GCN: $vgpr5 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $exec + $vgpr0, $vgpr1 = BUNDLE undef $vgpr3_vgpr4, implicit $exec { + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + } + $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec + $vgpr6 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $exec +... + +--- +name: dst_bundle_latency +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: dst_bundle_latency + ; GCN: $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $exec + ; GCN: $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $exec + ; GCN: BUNDLE killed $vgpr0, killed $vgpr1, undef $vgpr3_vgpr4, implicit $exec { + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr0, 4, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $exec + $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $exec + BUNDLE $vgpr0, $vgpr1, undef $vgpr3_vgpr4, implicit $exec { + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + } +...