Index: llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -83,6 +83,11 @@
                                  VectorType *SubTp,
                                  ArrayRef<const Value *> Args = None);
 
+  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
+                                  MaybeAlign Alignment, unsigned AddressSpace,
+                                  TTI::TargetCostKind CostKind,
+                                  const Instruction *I = nullptr);
+
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind);
Index: llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -10,6 +10,7 @@
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include <cmath>
 
 using namespace llvm;
@@ -180,8 +181,11 @@
   if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
     return getSpliceCost(Tp, Index);
 
+  // TODO: Add Cost Model for SK_PermuteSingleSrc/SK_Select
+  // TODO: For a fixed-width vector, add Cost Model for SK_Reverse/SK_Broadcast
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-  if (Kind == TTI::SK_Broadcast && isa<ScalableVectorType>(Tp))
+  if ((Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse) &&
+      isa<ScalableVectorType>(Tp))
     return LT.first * 1;
 
   return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
@@ -194,7 +198,6 @@
   if (!isa<ScalableVectorType>(Src))
     return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                         CostKind);
-
   return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
 }
@@ -224,6 +227,32 @@
   return NumLoads * MemOpCost;
 }
 
+InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
+                                              MaybeAlign Alignment,
+                                              unsigned AddressSpace,
+                                              TTI::TargetCostKind CostKind,
+                                              const Instruction *I) {
+  EVT VT = TLI->getValueType(DL, Ty, true);
+  // Type legalization can't handle structs
+  if (VT == MVT::Other)
+    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
+                                  CostKind);
+
+  auto LT = TLI->getTypeLegalizationCost(DL, Ty);
+  if (!LT.first.isValid())
+    return InstructionCost::getInvalid();
+
+  // TODO: consider latency as well for TCK_SizeAndLatency.
+  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+    return LT.first;
+
+  if (CostKind != TTI::TCK_RecipThroughput)
+    return 1;
+
+  // TODO: Check truncation stores and extending loads
+  return LT.first;
+}
+
 InstructionCost
 RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                     TTI::TargetCostKind CostKind) {
Index: llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
===================================================================
--- llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -55,20 +55,20 @@
 define void @vector_reverse() {
 ; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.experimental.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.experimental.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.experimental.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.experimental.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.experimental.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple riscv64-linux-gnu \
+; RUN:   -mattr=+v -debug-only=loop-vectorize \
+; RUN:   -riscv-v-vector-bits-min=128 -S < %s 2>&1 | FileCheck %s
+
+define dso_local void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @vector_reverse_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP3]], 4294967295
+; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
+; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 4
+; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP0]], -4
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[DOTNEG]], [[TMP12]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP11]], 4
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP13]]
+;
CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END5:%.*]] = sub i32 [[N]], [[CAST_VTC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP17]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[N]] +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG10:%.*]] = mul i32 [[TMP22]], -4 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[DOTNEG10]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP24]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP27:%.*]] = shl i32 [[TMP26]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = sub i32 1, [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[TMP28]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = sext i32 [[TMP29]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP32]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP35:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG11:%.*]] = mul i32 [[TMP36]], -4 +; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[DOTNEG11]], 1 +; CHECK-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP38]] +; CHECK-NEXT: store [[TMP35]], ptr [[TMP39]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP40:%.*]] = add [[WIDE_LOAD6]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = shl i32 [[TMP41]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = sub i32 1, [[TMP42]] +; CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP43]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP34]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP44]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, ptr [[TMP46]], i64 [[TMP47]] +; CHECK-NEXT: store [[TMP40]], ptr [[TMP48]], align 4, !alias.scope !3, !noalias !0 
+; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = shl i64 [[TMP49]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_SCEVCHECK]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[I_0:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = add i32 [[TMP52]], 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] + %i.0 = add nsw i32 %i.0.in8, -1 + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom + %1 = load i32, ptr %arrayidx, align 4 + %add9 = add i32 %1, 1 + %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom + store i32 %add9, ptr %arrayidx3, align 4 + %cmp = icmp ugt i64 %indvars.iv, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +} + +; Function Attrs: argmemonly nofree norecurse nosync nounwind +define dso_local void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) local_unnamed_addr #0 { +; CHECK-LABEL: @vector_reverse_f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP0]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP3]], 4294967295 +; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 4 +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP0]], -4 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[DOTNEG]], [[TMP12]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP11]], 4 +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP14]] +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[UGLYGEP]], [[UGLYGEP3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP2]], [[UGLYGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl i64 [[TMP15]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP16]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[CAST_VTC:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END5:%.*]] = sub i32 [[N]], [[CAST_VTC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP17]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[N]] +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG10:%.*]] = mul i32 [[TMP22]], -4 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[DOTNEG10]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = sext i32 [[TMP23]] to i64 +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP24]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP25]], align 4, !alias.scope !9 +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP27:%.*]] = shl i32 [[TMP26]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = sub i32 0, [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = sub i32 1, [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[TMP28]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = sext i32 [[TMP29]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP31]], i64 [[TMP32]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , 
ptr [[TMP33]], align 4, !alias.scope !9 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP35:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 1.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG11:%.*]] = mul i32 [[TMP36]], -4 +; CHECK-NEXT: [[TMP37:%.*]] = or i32 [[DOTNEG11]], 1 +; CHECK-NEXT: [[TMP38:%.*]] = sext i32 [[TMP37]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP38]] +; CHECK-NEXT: store [[TMP35]], ptr [[TMP39]], align 4, !alias.scope !12, !noalias !9 +; CHECK-NEXT: [[TMP40:%.*]] = fadd [[WIDE_LOAD6]], shufflevector ( insertelement ( poison, float 1.000000e+00, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP42:%.*]] = shl i32 [[TMP41]], 2 +; CHECK-NEXT: [[TMP43:%.*]] = sub i32 0, [[TMP42]] +; CHECK-NEXT: [[TMP44:%.*]] = sub i32 1, [[TMP42]] +; CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP43]] to i64 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP44]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[TMP46]], i64 [[TMP47]] +; CHECK-NEXT: store [[TMP40]], ptr [[TMP48]], align 4, !alias.scope !12, !noalias !9 +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = shl i64 [[TMP49]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_SCEVCHECK]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0_IN8:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[I_0:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_0]] = add nsw i32 [[I_0_IN8]], -1 +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_0]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CONV1:%.*]] = fadd float [[TMP52]], 1.000000e+00 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: store float [[CONV1]], ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br 
i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %0 = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] + %i.0 = add nsw i32 %i.0.in8, -1 + %idxprom = zext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom + %1 = load float, ptr %arrayidx, align 4 + %conv1 = fadd float %1, 1.000000e+00 + %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom + store float %conv1, ptr %arrayidx3, align 4 + %cmp = icmp ugt i64 %indvars.iv, 1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0 +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+a,+c,+m,+relax,-save-restore" } + +!0 = distinct !{!0, !1, !2, !3, !4} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +!4 = !{!"llvm.loop.vectorize.enable", i1 true}
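
Not part of the patch itself: a quick way to sanity-check the new cost hooks locally, assuming an opt built with the changes above, is the new-PM cost-model printer. The file and function names below are made up for illustration; with the patch applied, the reverse shuffle and the plain scalable-vector load should each be reported with an estimated cost of 1 rather than an invalid cost, matching the updated rvv-shuffle.ll expectations.

; reverse-cost-smoke.ll (illustrative only, not a committed test)
; Query the printed costs with (illustrative command):
;   opt -passes="print<cost-model>" -disable-output -mtriple=riscv64 -mattr=+v reverse-cost-smoke.ll 2>&1

define <vscale x 4 x i32> @reverse_after_load(ptr %p) {
  ; Plain scalable-vector load: now costed by RISCVTTIImpl::getMemoryOpCost (LT.first for a legal type).
  %v = load <vscale x 4 x i32>, ptr %p, align 4
  ; SK_Reverse on a scalable vector: now costed by getShuffleCost instead of falling back to an invalid cost.
  %r = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %v)
  ret <vscale x 4 x i32> %r
}

declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)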