diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -260,13 +260,18 @@ // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, // <4 x i32> undef, // <4 x i32> zeroinitializer - // %induction = add <4 x i32> %broadcast.splat, + // %induction = [add|or] <4 x i32> %broadcast.splat, // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11 - + // + // Please note that the 'or' is equivalent to the 'add' here; this relies on + // BroadcastSplat being the IV which we know is a phi with 0 start and Lanes + // increment, which is all being checked below. Instruction *BroadcastSplat = nullptr; Constant *Const = nullptr; if (!match(TCP.Induction, - m_Add(m_Instruction(BroadcastSplat), m_Constant(Const)))) + m_Add(m_Instruction(BroadcastSplat), m_Constant(Const))) && + !match(TCP.Induction, + m_Or(m_Instruction(BroadcastSplat), m_Constant(Const)))) return false; // Check that we're adding <0, 1, 2, 3... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -32,7 +32,7 @@ %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer - %induction = add <16 x i32> %broadcast.splat, + %induction = or <16 x i32> %broadcast.splat, %tmp = getelementptr inbounds i8, i8* %a, i32 %index %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i8* %tmp to <16 x i8>* @@ -137,7 +137,7 @@ %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer - %induction = add <4 x i32> %broadcast.splat, + %induction = or <4 x i32> %broadcast.splat, %tmp = getelementptr inbounds i32, i32* %a, i32 %index %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 %tmp2 = bitcast i32* %tmp to <4 x i32>*