Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -122,6 +122,10 @@
   bool runOnLoop(Loop *L, LPPassManager&) override;
 
 private:
+  /// Finds and fixes a pattern generated by the LSR pass which interferes
+  /// with tail predication.
+  void FixLSRPattern(Loop *L, ScalarEvolution &SE);
+
   /// Perform the relevant checks on the loop and convert if possible.
   bool TryConvert(Value *TripCount);
 
@@ -150,6 +154,126 @@
 
 } // end namespace
 
+static bool IsVCTPIntrinsic(const Value *V) {
+  const IntrinsicInst *Call = dyn_cast<IntrinsicInst>(V);
+  if (!Call)
+    return false;
+
+  Intrinsic::ID ID = Call->getIntrinsicID();
+  return ID == Intrinsic::arm_mve_vctp8 || ID == Intrinsic::arm_mve_vctp16 ||
+         ID == Intrinsic::arm_mve_vctp32 || ID == Intrinsic::arm_mve_vctp64;
+}
+
+// This function attempts to find and rewrite a pattern that the LSR
+// pass generates in some situations and which can't be tail-predicated.
+//
+// Here is an example of IR that LSR can generate:
+//
+// loopbody:
+//   %lsr.iv = phi i32 [ %lsr.iv.next, %loopbody ], [ %42, %pred ]
+//   %44 = add i32 %lsr.iv, -4
+//   %45 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %44) #5
+//   ; ... etc
+//   %lsr.iv.next = add nsw i32 %lsr.iv, -4
+//
+// That can't be tail-predicated because the VCTP's operand is defined in the
+// loop, so this function will transform it into:
+//
+// pred:
+//   %44 = add i32 %42, -4
+// loopbody:
+//   %lsr.iv = phi i32 [ %lsr.iv.next, %loopbody ], [ %42, %pred ]
+//   %lsr.fixed = phi i32 [ %lsr.iv.next, %loopbody ], [ %44, %pred ]
+//   %45 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %lsr.fixed)
+//   ; ... etc
+//   %lsr.iv.next = add nsw i32 %lsr.iv, -4
+//
+// Which is equivalent to what LSR generated, but can be tail-predicated just
+// fine.
+void MVETailPredication::FixLSRPattern(Loop *L, ScalarEvolution &SE) {
+  // This pattern has only been found in single-block loops for now, so let's
+  // keep this simple.
+  if (L->getBlocks().size() != 1)
+    return;
+  BasicBlock *BodyBB = L->getHeader();
+  BasicBlock *LoopPredBB = L->getLoopPredecessor();
+  if (!LoopPredBB)
+    return;
+
+  // VCTPAdd should be an add of a PHI value and a constant, used only by a
+  // VCTP intrinsic.
+  Instruction *VCTPAdd = nullptr;
+  // The VCTP that uses VCTPAdd.
+  Instruction *VCTP = nullptr;
+  // PHIAdd should be an add with the same type/operands as VCTPAdd, used only
+  // by a PHI instruction in the loop's body.
+  Instruction *PHIAdd = nullptr;
+
+  for (Instruction &I : BodyBB->instructionsWithoutDebug()) {
+    // We're only interested in single-use adds.
+    if (!I.hasOneUse() || I.getOpcode() != Instruction::Add)
+      continue;
+    Value *User = *I.user_begin();
+    Value *Op0 = I.getOperand(0);
+    Value *Op1 = I.getOperand(1);
+
+    if (!isa<PHINode>(Op0) || !isa<Constant>(Op1))
+      continue;
+
+    if (!VCTPAdd) {
+      if (IsVCTPIntrinsic(User)) {
+        VCTPAdd = &I;
+        VCTP = cast<Instruction>(User);
+      }
+      continue;
+    }
+
+    PHINode *PHIUser = dyn_cast<PHINode>(User);
+    if (!PHIUser || PHIUser->getParent() != BodyBB)
+      continue;
+
+    // Check if this add is equivalent to VCTPAdd.
+    if (I.getType() != VCTPAdd->getType() || Op0 != VCTPAdd->getOperand(0) ||
+        Op1 != VCTPAdd->getOperand(1))
+      continue;
+
+    PHIAdd = &I;
+    break;
+  }
+
+  if (!VCTPAdd || !PHIAdd)
+    return;
+
+  // Move VCTPAdd into the loop's predecessor.
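+  // Note that, right after the move, VCTPAdd still uses the PHI defined in
+  // the loop's body, so the IR is temporarily invalid; the setOperand call
+  // below rewrites the operand to the predecessor's incoming value.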
+  VCTPAdd->moveBefore(LoopPredBB->getTerminator());
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Moving" << *VCTPAdd
+                    << "\n into the loop's predecessor, before"
+                    << *LoopPredBB->getTerminator() << "\n");
+
+  // Replace VCTPAdd's operand with the PHI's incoming value from the loop's
+  // predecessor.
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Fixing:" << *VCTPAdd << "\n");
+  PHINode *PHIOperand = cast<PHINode>(VCTPAdd->getOperand(0));
+  VCTPAdd->setOperand(0, PHIOperand->getIncomingValueForBlock(LoopPredBB));
+  LLVM_DEBUG(dbgs() << " ... done:" << *VCTPAdd << "\n");
+
+  // Insert a new PHI for PHIAdd/VCTPAdd, and replace the uses of VCTPAdd
+  // within the loop body with that new PHI.
+  PHINode *NewPHI = PHINode::Create(VCTPAdd->getType(), 2, "lsr.fixed",
+                                    BodyBB->getFirstNonPHI());
+  NewPHI->addIncoming(VCTPAdd, LoopPredBB);
+  NewPHI->addIncoming(PHIAdd, BodyBB);
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Added PHI:" << *NewPHI << "\n");
+
+  // Fix the VCTP that uses VCTPAdd so it uses the new PHI instead.
+  LLVM_DEBUG(dbgs() << "FixLSRPattern: Fixing:" << *VCTP << "\n");
+  VCTP->replaceUsesOfWith(VCTPAdd, NewPHI);
+  LLVM_DEBUG(dbgs() << " ... done:" << *VCTP << "\n");
+}
+
 static bool IsDecrement(Instruction &I) {
   auto *Call = dyn_cast<IntrinsicInst>(&I);
   if (!Call)
@@ -204,6 +328,8 @@
     return false;
   }
 
+  FixLSRPattern(L, *SE);
+
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader)
     return false;
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tp-lsr-patterns.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tp-lsr-patterns.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mve-tail-predication -disable-mve-tail-predication=false %s -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+define dso_local arm_aapcs_vfpcc void @arm_cmplx_abs_sum_real_f32(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @arm_cmplx_abs_sum_real_f32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]],
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP1:%.*]], 3
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP1]], -1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP12]], 4
+; CHECK-NEXT:    [[SMIN:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 4
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP11]], [[SMIN]]
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP10]], -4
+; CHECK-NEXT:    br label [[TMP18:%.*]]
+; CHECK:       18:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP18]] ], [ [[TMP10]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP26:%.*]], [[TMP18]] ]
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <4 x i32> [ [[TMP9]], [[TMP2]] ], [ [[TMP24:%.*]], [[TMP18]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = phi i32 [ [[TMP16]], [[TMP2]] ], [ [[TMP27:%.*]], [[TMP18]] ]
+; CHECK-NEXT:    [[LSR_FIXED:%.*]] = phi i32 [ [[TMP17]], [[TMP2]] ], [ [[LSR_IV_NEXT]], [[TMP18]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[LSR_FIXED]])
+; CHECK-NEXT:    [[TMP23:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP20]], i32 32, <4 x i1> [[TMP22]])
+; CHECK-NEXT:    [[TMP24]] = extractvalue { <4 x float>, <4 x i32> } [[TMP23]], 1
+; CHECK-NEXT:    [[TMP25:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP23]], 0
+; CHECK-NEXT:    [[TMP26]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP19]], <4 x float> [[TMP25]], <4 x i1> [[TMP22]], <4 x float> [[TMP19]])
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP27]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[TMP21]], i32 1)
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne i32 [[TMP27]], 0
+; CHECK-NEXT:    br i1 [[TMP28]], label [[TMP18]], label [[TMP29:%.*]]
+; CHECK:       29:
+; CHECK-NEXT:    ret void
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = ptrtoint float* %0 to i32
+  %6 = insertelement <4 x i32> undef, i32 %5, i32 0
+  %7 = add <4 x i32> %6,
+  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <4 x i32> zeroinitializer
+  %9 = add <4 x i32> %4, %8
+  %10 = add i32 %1, 3
+  %11 = add i32 %1, 2
+  %12 = add i32 %1, -1
+  %13 = icmp slt i32 %12, 4
+  %smin = select i1 %13, i32 %12, i32 4
+  %14 = sub i32 %11, %smin
+  %15 = lshr i32 %14, 2
+  %16 = add nuw nsw i32 %15, 1
+  call void @llvm.set.loop.iterations.i32(i32 %16)
+  br label %17
+
+17:                                               ; preds = %17, %2
+  %lsr.iv = phi i32 [ %lsr.iv.next, %17 ], [ %10, %2 ]
+  %18 = phi <4 x float> [ zeroinitializer, %2 ], [ %26, %17 ]
+  %19 = phi <4 x i32> [ %9, %2 ], [ %24, %17 ]
+  %20 = phi i32 [ %16, %2 ], [ %27, %17 ]
+  %21 = add i32 %lsr.iv, -4
+  %22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %21)
+  %23 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %19, i32 32, <4 x i1> %22)
+  %24 = extractvalue { <4 x float>, <4 x i32> } %23, 1
+  %25 = extractvalue { <4 x float>, <4 x i32> } %23, 0
+  %26 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %18, <4 x float> %25, <4 x i1> %22, <4 x float> %18)
+  %lsr.iv.next = add i32 %lsr.iv, -4
+  %27 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %20, i32 1)
+  %28 = icmp ne i32 %27, 0
+  br i1 %28, label %17, label %29
+
+29:                                               ; preds = %17
+  ret void
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) #1
+declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) #2
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+declare void @llvm.set.loop.iterations.i32(i32) #3
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m55" "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-aes,-bf16,-cdecp0,-cdecp1,-cdecp2,-cdecp3,-cdecp4,-cdecp5,-cdecp6,-cdecp7,-crc,-crypto,-dotprod,-fp16fml,-hwdiv-arm,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { noduplicate nounwind }