MachineCSE: do not CSE convergent instructions across basic blocks.

Reusing the def of a convergent instruction (e.g. DS_SWIZZLE_B32) from a
different control-flow scope changes the set of threads executing it and can
produce illegal codegen.  isProfitableToCSE() now bails out when MI is
convergent and the candidate's block (CSBB) differs from MI's block.  The
`BB = MI->getParent()` definition is hoisted so both the new check and the
existing isAsCheapAsAMove heuristic share it.  A new AMDGPU MIR test verifies
that the swizzle in bb.2 is no longer CSE-ed against the one in bb.1.

diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -433,6 +433,11 @@
                                    MachineBasicBlock *CSBB, MachineInstr *MI) {
   // FIXME: Heuristics that works around the lack the live range splitting.
 
+  MachineBasicBlock *BB = MI->getParent();
+  // Prevent CSE-ing non-local convergent instructions.
+  if (MI->isConvergent() && CSBB != BB)
+    return false;
+
   // If CSReg is used at all uses of Reg, CSE should not increase register
   // pressure of CSReg.
   bool MayIncreasePressure = true;
@@ -455,7 +460,6 @@
   // an immediate predecessor. We don't want to increase register pressure and
   // end up causing other computation to be spilled.
   if (TII->isAsCheapAsAMove(*MI)) {
-    MachineBasicBlock *BB = MI->getParent();
     if (CSBB != BB && !CSBB->isSuccessor(BB))
       return false;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir
@@ -0,0 +1,73 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -run-pass=machine-cse %s | FileCheck %s
+
+# Check that we don't CSE non-local convergent instrs. Otherwise, reusing defs
+# of convergent instrs from different control flow scopes can cause illegal
+# codegen. Previously, the swizzle in bb2 would be CSE-ed in favor of using the
+# swizzle in bb1 despite bb2 being a different control flow scope.
+
+# CHECK-LABEL: name: no_cse
+# CHECK: bb.1.if.then
+# CHECK: [[SWIZZLE1:%[0-9]+]]:vgpr_32 = DS_SWIZZLE_B32 [[SRC:%[0-9]+]], 100, 0, implicit $exec
+# CHECK-NEXT: V_ADD_CO_U32_e64 [[SWIZZLE1]], {{%[0-9]+}}, 0, implicit $exec
+# CHECK-NEXT: S_CMP_LT_I32 {{.*}} implicit-def $scc
+# CHECK-NEXT: S_CBRANCH_SCC1 %bb.3, implicit $scc
+# CHECK-NEXT: S_BRANCH %bb.2
+# CHECK: bb.2.if.then.if.then
+# CHECK: [[SWIZZLE2:%[0-9]+]]:vgpr_32 = DS_SWIZZLE_B32 [[SRC]], 100, 0, implicit $exec
+# CHECK-NEXT: V_ADD_CO_U32_e64 [[SWIZZLE2]], {{%[0-9]+}}, 0, implicit $exec
+
+--- |
+  define amdgpu_kernel void @no_cse(i32 addrspace(1)*, i32, i1) {
+  entry:
+    unreachable
+  if.then:
+    unreachable
+  if.then.if.then:
+    unreachable
+  if.then.phi:
+    unreachable
+  exit:
+    unreachable
+  }
+...
+---
+name: no_cse
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $sgpr4_sgpr5
+    %0:sgpr_64(p4) = COPY $sgpr4_sgpr5
+    %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 0, 0
+    %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0(p4), 2, 0
+    %3:sreg_64 = COPY %1
+    %4:sreg_32 = COPY %2.sub1
+    %5:sreg_32 = S_MOV_B32 42
+    S_CMP_EQ_U32 %4, %5, implicit-def $scc
+    %6:vgpr_32 = COPY %5, implicit $exec
+    S_CBRANCH_SCC1 %bb.4, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1.if.then:
+    %7:sreg_32 = COPY %2.sub0
+    %8:vgpr_32 = COPY %7
+    %9:vgpr_32 = DS_SWIZZLE_B32 %8, 100, 0, implicit $exec
+    %10:vgpr_32, %21:sreg_32 = V_ADD_CO_U32_e64 %9, %5, 0, implicit $exec
+    S_CMP_LT_I32 %7, %5, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.3, implicit $scc
+    S_BRANCH %bb.2
+
+  bb.2.if.then.if.then:
+    %11:sreg_32 = S_MOV_B32 64
+    %12:vgpr_32 = DS_SWIZZLE_B32 %8, 100, 0, implicit $exec
+    %13:vgpr_32, %24:sreg_32 = V_ADD_CO_U32_e64 %12, %11, 0, implicit $exec
+
+  bb.3.if.then.phi:
+    %14:vgpr_32 = PHI %10, %bb.1, %13, %bb.2
+
+  bb.4.exit:
+    %15:vgpr_32 = PHI %6, %bb.0, %14, %bb.3
+    %16:vreg_64 = COPY %3
+    FLAT_STORE_DWORD %16, %15, 0, 0, implicit $exec, implicit $flat_scr
+    S_ENDPGM 0
+
+...