Index: llvm/lib/CodeGen/MachineLICM.cpp
===================================================================
--- llvm/lib/CodeGen/MachineLICM.cpp
+++ llvm/lib/CodeGen/MachineLICM.cpp
@@ -1043,6 +1043,13 @@
       !IsGuaranteedToExecute(I.getParent()))
     return false;
 
+  // The result of a convergent instruction may depend on the set of threads
+  // that execute it together. That set can differ between the loop body and
+  // the code outside the loop, so moving the instruction out of the loop
+  // could change its result. Never treat it as a LICM candidate.
+  if (I.isConvergent())
+    return false;
+
   return true;
 }
 
Index: llvm/lib/Transforms/Scalar/LICM.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LICM.cpp
+++ llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1160,6 +1160,13 @@
     if (CI->mayThrow())
       return false;
 
+    // The result of a convergent call may depend on the set of threads that
+    // execute it together. That set can differ between the loop body and the
+    // code outside the loop, so moving the call out of the loop could change
+    // its result, even when the callee is readnone.
+    if (CI->isConvergent())
+      return false;
+
     using namespace PatternMatch;
     if (match(CI, m_Intrinsic<Intrinsic::assume>()))
       // Assumes don't actually alias anything or throw
Index: llvm/test/CodeGen/AMDGPU/machinelicm-convergent.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/machinelicm-convergent.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=amdgcn -run-pass=early-machinelicm -o - %s | FileCheck %s
+
+# Test to check machine LICM does not hoist convergent instructions,
+# DS_PERMUTE_B32 in this example.
+
+---
+# CHECK-LABEL: name: _amdgpu_cs_main
+# CHECK: bb.1:
+# CHECK: DS_PERMUTE_B32
+
+name: _amdgpu_cs_main
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    successors: %bb.1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    %2:vgpr_32 = DS_PERMUTE_B32 %0, %1, 0, implicit $exec
+    %3:vgpr_32 = V_ADD_CO_U32_e32 %0, %2, implicit-def $vcc, implicit $exec
+    S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr0 = COPY %3
+    S_ENDPGM 0
+
+...
Index: llvm/test/Transforms/LICM/convergent.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LICM/convergent.ll
@@ -0,0 +1,21 @@
+; RUN: opt < %s -S -licm | FileCheck %s
+
+; Check that LICM does not hoist a call to a convergent function out of a loop.
+; CHECK-LABEL: define i32 @test
+; CHECK: loop:
+; CHECK: call i32 @f
+
+define i32 @test(i32* nocapture noalias %x, i32* nocapture %y) {
+entry:
+  br label %loop
+
+loop:
+  %a = call i32 @f() nounwind readnone convergent
+  %exitcond = icmp ne i32 %a, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %a
+}
+
+declare i32 @f() nounwind readnone convergent