Index: llvm/lib/CodeGen/MachineCSE.cpp =================================================================== --- llvm/lib/CodeGen/MachineCSE.cpp +++ llvm/lib/CodeGen/MachineCSE.cpp @@ -433,11 +433,6 @@ MachineBasicBlock *CSBB, MachineInstr *MI) { // FIXME: Heuristics that works around the lack the live range splitting. - MachineBasicBlock *BB = MI->getParent(); - // Prevent CSE-ing non-local convergent instructions. - if (MI->isConvergent() && CSBB != BB) - return false; - // If CSReg is used at all uses of Reg, CSE should not increase register // pressure of CSReg. bool MayIncreasePressure = true; @@ -460,6 +455,7 @@ // an immediate predecessor. We don't want to increase register pressure and // end up causing other computation to be spilled. if (TII->isAsCheapAsAMove(*MI)) { + MachineBasicBlock *BB = MI->getParent(); if (CSBB != BB && !CSBB->isSuccessor(BB)) return false; } @@ -592,6 +588,23 @@ LLVM_DEBUG(dbgs() << "Examining: " << *MI); LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI); + // Prevent CSE-ing non-local convergent instructions. + // LLVM's current definition of `isConvergent` does not necessarily prove + // that non-local CSE is illegal. The following check extends the definition + // of `isConvergent` to assume a convergent instruction is dependent not + // only on additional conditions, but also on fewer conditions. LLVM does + // not have a MachineInstr attribute which expresses this extended + // definition, so it's necessary to use `isConvergent` to prevent illegally + // CSE-ing the subset of `isConvergent` instructions which do fall into this + // extended definition. + if (MI->isConvergent() && MI->getParent() != CSMI->getParent()) { + LLVM_DEBUG(dbgs() << "*** Convergent MI and subexpression exist in " + "different BBs, avoid CSE!\n"); + VNT.insert(MI, CurrVN++); + Exps.push_back(MI); + continue; + } + // Check if it's profitable to perform this CSE. bool DoCSE = true; unsigned NumDefs = MI->getNumDefs(); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/no-cse-nonlocal-convergent-instrs.mir @@ -1,9 +1,22 @@ # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -o - -run-pass=machine-cse %s | FileCheck %s -# Check that we don't CSE non-local convergent instrs. Otherwise, reusing defs -# of convergent instrs from different control flow scopes can cause illegal -# codegen. Previously, the swizzle in bb2 would be CSE-ed in favor of using the -# swizzle in bb1 despite bb2 being a different control flow scope. +# LLVM's current definition of `isConvergent` does not necessarily prove that +# non-local CSE is illegal. The following test extends the definition of +# `isConvergent` to assume a convergent instruction is dependent not only on +# additional conditions, but also on fewer conditions. LLVM does not have a +# MachineInstr attribute which expresses this extended definition, so it's +# necessary to use `isConvergent` to prevent illegally CSE-ing the subset of +# `isConvergent` instructions which do fall into this extended definition. + +# This is a coverage test for the MachineCSE change. It does not reproduce an +# actual bug in the AMDGPU backend. The current open source GPU backends as is +# do not appear to allow a reasonably simple test case that provably and +# undeniably functionally breaks without the associated MachineCSE changes. + +# The test checks that we don't CSE non-local convergent instrs. Otherwise, +# reusing defs of convergent instrs from different control flow scopes can +# cause illegal codegen. Previously, the swizzle in bb2 would be CSE-ed in +# favor of using the swizzle in bb1 despite bb2 being a different BBs. # CHECK-LABEL: name: no_cse # CHECK: bb.1.if.then