Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -513,6 +513,9 @@
   bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                      TargetTransformInfo::LSRCost &C2) const;
 
+  /// Return true if LSR is allowed to change \p I's operands.
+  bool canLSRFixupInstruction(Instruction *I) const;
+
   /// Return true if the target can fuse a compare and branch.
   /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost
   /// calculation for the instructions in a loop.
@@ -1210,6 +1213,7 @@
                              Instruction *I) = 0;
   virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                              TargetTransformInfo::LSRCost &C2) = 0;
+  virtual bool canLSRFixupInstruction(Instruction *I) = 0;
   virtual bool canMacroFuseCmp() = 0;
   virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                           LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
@@ -1495,6 +1499,9 @@
                      TargetTransformInfo::LSRCost &C2) override {
     return Impl.isLSRCostLess(C1, C2);
   }
+  bool canLSRFixupInstruction(Instruction *I) override {
+    return Impl.canLSRFixupInstruction(I);
+  }
   bool canMacroFuseCmp() override { return Impl.canMacroFuseCmp(); }
   bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
                   DominatorTree *DT, AssumptionCache *AC,
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -165,6 +165,8 @@
             C2.ScaleCost, C2.ImmCost, C2.SetupCost);
   }
 
+  bool canLSRFixupInstruction(Instruction *I) { return true; }
+
   bool canMacroFuseCmp() { return false; }
 
   bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI,
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -262,6 +262,10 @@
     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
   }
 
+  bool canLSRFixupInstruction(Instruction *I) {
+    return TargetTransformInfoImplBase::canLSRFixupInstruction(I);
+  }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
     TargetLoweringBase::AddrMode AM;
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -259,6 +259,10 @@
   return TTIImpl->isLSRCostLess(C1, C2);
 }
 
+bool TargetTransformInfo::canLSRFixupInstruction(Instruction *I) const {
+  return TTIImpl->canLSRFixupInstruction(I);
+}
+
 bool TargetTransformInfo::canMacroFuseCmp() const {
   return TTIImpl->canMacroFuseCmp();
 }
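For context on the hook's contract, a minimal sketch of the kind of rewrite it gates (hypothetical IR, not taken from this patch): LSR's fixup step may replace an instruction's use of the original induction variable with an equivalent expression derived from an IV that LSR itself introduces.

  ; Before LSR: the intrinsic consumes the element-count IV directly.
  %elts = phi i32 [ %N, %entry ], [ %elts.next, %loop ]
  %p = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)
  %elts.next = add nsw i32 %elts, -4

  ; After a fixup: the operand is re-derived from an unrelated LSR IV.
  %lsr.iv = phi i32 [ %n.adjusted, %entry ], [ %lsr.iv.next, %loop ]
  %elts.derived = add i32 %lsr.iv, -4
  %p = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts.derived)

A target whose later passes pattern-match the first form can return false from canLSRFixupInstruction to keep such instructions out of the rewrite, as the ARM implementation below does for the MVE vctp intrinsics.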
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -150,6 +150,8 @@
     return ST->getMaxInterleaveFactor();
   }
 
+  bool canLSRFixupInstruction(Instruction *I);
+
   bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
 
   bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) {
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/SubtargetFeature.h"
@@ -509,6 +510,23 @@
   return BaseT::getAddressComputationCost(Ty, SE, Ptr);
 }
 
+bool ARMTTIImpl::canLSRFixupInstruction(Instruction *I) {
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    // Do not allow LSR to fix up a vctp intrinsic's operands; rewriting its
+    // element count could prevent tail predication.
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::arm_mve_vctp8:
+    case Intrinsic::arm_mve_vctp16:
+    case Intrinsic::arm_mve_vctp32:
+    case Intrinsic::arm_mve_vctp64:
+      return false;
+    default:
+      break;
+    }
+  }
+  return true;
+}
+
 bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
   if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
     return false;
Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2001,6 +2001,7 @@
   void GenerateCrossUseConstantOffsets();
   void GenerateAllReuseFormulae();
 
+  void FilterOutUndesirableUses();
   void FilterOutUndesirableDedicatedRegisters();
 
   size_t EstimateSearchSpaceComplexity() const;
@@ -4293,6 +4294,29 @@
       print_uses(dbgs()));
 }
 
+/// If TTI.canLSRFixupInstruction returns false for the UserInst of any of an
+/// LSRUse's Fixups, remove the LSRUse entirely.
+void LSRInstance::FilterOutUndesirableUses() {
+  auto Iter = Uses.begin();
+  while (Iter != Uses.end()) {
+    bool IsDead = false;
+    for (LSRFixup &Fixup : Iter->Fixups) {
+      if (!TTI.canLSRFixupInstruction(Fixup.UserInst)) {
+        IsDead = true;
+        break;
+      }
+    }
+
+    if (IsDead) {
+      LLVM_DEBUG(
+          dbgs() << "Discarding use that contains a fixup forbidden by TTI: ";
+          Iter->dump());
+      Iter = Uses.erase(Iter);
+    } else
+      ++Iter;
+  }
+}
+
 /// If there are multiple formulae with the same set of registers used
 /// by other uses, pick the best one and delete the others.
 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
@@ -5571,6 +5595,7 @@
   // to formulate the values needed for the uses.
   GenerateAllReuseFormulae();
 
+  FilterOutUndesirableUses();
   FilterOutUndesirableDedicatedRegisters();
 
   NarrowSearchSpaceUsingHeuristics();
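The effect on a protected loop, sketched as hypothetical IR under the assumption that the target rejects the fixup (the autogenerated checks in the new test below show the real output): LSR still creates its own induction variable for the uses it may rewrite, while the use feeding the vctp keeps the original element-count PHI, so both IVs stay live in the loop.

  loop:
    ; IV created by LSR for the uses it did rewrite.
    %lsr.iv = phi i32 [ %lsr.iv.next, %loop ], [ %n.plus.3, %entry ]
    ; Original element-count IV, preserved for the vctp.
    %elts = phi i32 [ %elts.next, %loop ], [ %n.minus.1, %entry ]
    %p = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)
    ...
    %elts.next = add nsw i32 %elts, -4
    %lsr.iv.next = add i32 %lsr.iv, -4
    %again = icmp sgt i32 %lsr.iv.next, 4
    br i1 %again, label %loop, label %exit

Erasing the whole LSRUse, rather than filtering individual formulae, means LSR never considers alternative expressions for these fixups at all; that is the conservative choice for instructions that later target passes pattern-match.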
Index: llvm/test/Transforms/LoopStrengthReduce/ARM/forbidden-fixups.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopStrengthReduce/ARM/forbidden-fixups.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve %s -S -loop-reduce -o - | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m-arm-none-eabi"
+
+define dso_local arm_aapcs_vfpcc float @vctp8(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctp8(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP13]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[MASK]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]]) #4
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %12)
+  %mask = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define dso_local arm_aapcs_vfpcc float @vctp16(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctp16(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP13]])
+; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[MASK]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[MASK]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]]) #4
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %12)
+  %mask = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> %15)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define dso_local arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctpi32(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[TMP16]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]])
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+define dso_local arm_aapcs_vfpcc float @vctpi64(float* %0, i32 %1) local_unnamed_addr #0 {
+; CHECK-LABEL: @vctpi64(
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]],
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP1]], 3
+; CHECK-NEXT:    br label [[TMP12:%.*]]
+; CHECK:       12:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32 [ [[LSR_IV_NEXT:%.*]], [[TMP12]] ], [ [[TMP11]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ [[TMP5]], [[TMP2]] ], [ [[TMP21:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP20:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP18:%.*]], [[TMP12]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP13]])
+; CHECK-NEXT:    [[TMP17:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP15]], i32 32, <4 x i1> [[TMP16]])
+; CHECK-NEXT:    [[TMP18]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP17]], 0
+; CHECK-NEXT:    [[TMP20]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP14]], <4 x float> [[TMP19]], <4 x i1> [[TMP16]], <4 x float> [[TMP14]])
+; CHECK-NEXT:    [[TMP21]] = add nsw i32 [[TMP13]], -4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp sgt i32 [[LSR_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP22]], label [[TMP12]], label [[TMP23:%.*]]
+; CHECK:       23:
+; CHECK-NEXT:    [[TMP24:%.*]] = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP20]]) #4
+; CHECK-NEXT:    [[TMP25:%.*]] = sitofp i32 [[TMP24]] to float
+; CHECK-NEXT:    [[TMP26:%.*]] = tail call float @llvm.fabs.f32(float [[TMP25]])
+; CHECK-NEXT:    ret float [[TMP26]]
+;
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 0
+  %5 = add nsw i32 %1, -1
+  %6 = ptrtoint float* %0 to i32
+  %7 = insertelement <4 x i32> undef, i32 %6, i32 0
+  %8 = add <4 x i32> %7,
+  %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
+  %10 = add <4 x i32> %4, %9
+  br label %11
+
+11:                                               ; preds = %11, %2
+  %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
+  %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
+  %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
+  %15 = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 %12)
+  %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
+  %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
+  %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
+  %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
+  %20 = add nsw i32 %12, -4
+  %21 = icmp sgt i32 %12, 4
+  br i1 %21, label %11, label %22
+
+22:                                               ; preds = %11
+  %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) #0
+  %24 = sitofp i32 %23 to float
+  %25 = tail call float @llvm.fabs.f32(float %24)
+  ret float %25
+}
+
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) #1
+declare <16 x i1> @llvm.arm.mve.vctp8(i32) #1
+declare <8 x i1> @llvm.arm.mve.vctp16(i32) #1
+declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
+declare <4 x i1> @llvm.arm.mve.vctp64(i32) #1
+declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) #2
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1
+declare dso_local arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...) local_unnamed_addr #0
+declare dso_local arm_aapcs_vfpcc <4 x i1> @v8i1_to_v4i1(<8 x i1>) local_unnamed_addr #0
+declare dso_local arm_aapcs_vfpcc <4 x i1> @v16i1_to_v4i1(<16 x i1>) local_unnamed_addr #0
+declare float @llvm.fabs.f32(float) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #4 = { nounwind readnone speculatable willreturn }