Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -514,12 +514,24 @@ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy); } - // Int inductions are special because we only allow one IV. - if (ID.getKind() == InductionDescriptor::IK_IntInduction && - ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() && + auto IsStepValueOne = [&] () { + if (ID.getConstIntStepValue()->isOne() && isa(ID.getStartValue()) && - cast(ID.getStartValue())->isNullValue()) { + cast(ID.getStartValue())->isNullValue()) + return true; + return false; + }; + auto IsStepValueMinusOne = [&] () { + if (ID.getConstIntStepValue()->isMinusOne()) + return true; + return false; + }; + + // Int inductions are special because we only allow one IV. + if (ID.getKind() == InductionDescriptor::IK_IntInduction && + ID.getConstIntStepValue() && + (IsStepValueOne() || IsStepValueMinusOne())) { // Use the phi node with the widest type as induction. Use the last // one if there are multiple (no good reason for doing this other // than it is expedient). We've checked that it begins at zero and Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1825,13 +1825,19 @@ return cast(Step)->getValue(); }; + auto IsStepValueMinusOne = [&] () -> bool { + return ID.getConstIntStepValue() && + ID.getConstIntStepValue()->isMinusOne(); + }; + // The scalar value to broadcast. This is derived from the canonical // induction variable. If a truncation type is given, truncate the canonical // induction variable and step. Otherwise, derive these values from the // induction descriptor. auto CreateScalarIV = [&](Value *&Step) -> Value * { Value *ScalarIV = Induction; - if (IV != OldInduction) { + + if (IV != OldInduction || IsStepValueMinusOne()) { ScalarIV = IV->getType()->isIntegerTy() ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +; Check that when we can't predicate this loop that it is still vectorised (with an epilogue). +; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefix=CHECK-NO-FOLD +; CHECK-NO-FOLD: vector.body +; CHECK-NO-FOLD-NOT: masked + +; Check tail-folding and predicated vector body. ; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s ; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s -; Check that when we can't predicate this loop that it is still vectorised (with -; an epilogue). -; TODO: the reason this can't be predicated is because a primary induction -; variable can't be found (not yet) for this counting down loop. But with that -; fixed, this should be able to be predicated. - target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-unknown-eabihf" @@ -18,61 +18,62 @@ ; CHECK-NEXT: [[CMP6:%.*]] = icmp eq i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]] -; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, i8* [[C:%.*]], i32 [[N_VEC]] -; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 [[N_VEC]] -; CHECK-NEXT: [[IND_END6:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[C:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 [[N_VEC]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <16 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT8]], <16 x i32> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[N]], [[INDEX]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[C]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[B]], i32 [[TMP2]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[B]], i32 [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, i8* [[A]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP8]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[NEXT_GEP7]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i8> [[WIDE_LOAD9]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP8]], <16 x i8>* [[TMP10]], align 1 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[A]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <16 x i32> [[INDUCTION]], [[BROADCAST_SPLAT9]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP7]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP6]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[NEXT_GEP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP8]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i8> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <16 x i8>* +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[TMP9]], <16 x i8>* [[TMP11]], i32 1, <16 x i1> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[C]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i8* [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[B]], [[WHILE_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i8* [ [[IND_END6]], [[MIDDLE_BLOCK]] ], [ [[A]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[C]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8* [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ [[B]], [[WHILE_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i8* [ [[IND_END5]], [[MIDDLE_BLOCK]] ], [ [[A]], [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[N_ADDR_010:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[C_ADDR_09:%.*]] = phi i8* [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[A_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[A_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_07]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[A_ADDR_07]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[A_ADDR_07]], align 1 ; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[B_ADDR_08]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* [[B_ADDR_08]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP14]], [[TMP13]] ; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i8, i8* [[C_ADDR_09]], i32 1 ; CHECK-NEXT: store i8 [[ADD]], i8* [[C_ADDR_09]], align 1 ; CHECK-NEXT: [[DEC]] = add i32 [[N_ADDR_010]], -1 Index: llvm/test/Transforms/LoopVectorize/reverse_induction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/reverse_induction.ll +++ llvm/test/Transforms/LoopVectorize/reverse_induction.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -5,12 +6,82 @@ ; Make sure consecutive vector generates correct negative indices. ; PR15882 -; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %offset.idx = sub i64 %startval, %index -; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 -; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 - define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) { +; CHECK-LABEL: @reverse_induction_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], 1024 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] + +; This is the required offset.idx calculation that needs to be preserved: +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]] + +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4 +; CHECK-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[OFFSET_IDX3]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION6:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], +; CHECK-NEXT: [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], -1 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4 +; CHECK-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD9]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP15]] = add <4 x i32> [[REVERSE10]], [[VEC_PHI8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF12]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[BIN_RDX13]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ADD_I7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC4:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD_I]] = add i64 [[ADD_I7]], -1 +; CHECK-NEXT: [[KIND__I:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[ADD_I]] +; CHECK-NEXT: [[TMP_I1:%.*]] = load i32, i32* [[KIND__I]], align 4 +; CHECK-NEXT: [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]] +; CHECK-NEXT: [[INC4]] = add i32 [[I_06]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop !2 +; CHECK: loopend: +; CHECK-NEXT: [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[INC_REDUX_LCSSA]] +; entry: br label %for.body @@ -30,13 +101,79 @@ ret i32 %inc.redux } -; CHECK-LABEL: @reverse_induction_i128( -; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %offset.idx = sub i128 %startval, %index -; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0 -; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4 - define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) { +; CHECK-LABEL: @reverse_induction_i128( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[IND_END:%.*]] = sub i128 [[STARTVAL:%.*]], 1024 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i128 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i128 [[STARTVAL]], [[INDEX]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i128> undef, i128 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i128> [[BROADCAST_SPLATINSERT]], <4 x i128> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i128> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i128> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i128 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i128 [[OFFSET_IDX]], -4 +; CHECK-NEXT: [[OFFSET_IDX3:%.*]] = trunc i128 [[INDEX]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> undef, i32 [[OFFSET_IDX3]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION6:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], +; CHECK-NEXT: [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT5]], +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX3]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = add i128 [[TMP0]], -1 +; CHECK-NEXT: [[TMP5:%.*]] = add i128 [[TMP1]], -1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i128 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i128 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4 +; CHECK-NEXT: [[REVERSE10:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD9]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP15]] = add <4 x i32> [[REVERSE10]], [[VEC_PHI8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i128 [[INDEX]], 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i128 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF12]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[BIN_RDX13]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i128 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i128 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ADD_I7:%.*]] = phi i128 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC4:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD_I]] = add i128 [[ADD_I7]], -1 +; CHECK-NEXT: [[KIND__I:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i128 [[ADD_I]] +; CHECK-NEXT: [[TMP_I1:%.*]] = load i32, i32* [[KIND__I]], align 4 +; CHECK-NEXT: [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]] +; CHECK-NEXT: [[INC4]] = add i32 [[I_06]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop !5 +; CHECK: loopend: +; CHECK-NEXT: [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[INC_REDUX_LCSSA]] +; entry: br label %for.body @@ -56,13 +193,92 @@ ret i32 %inc.redux } -; CHECK-LABEL: @reverse_induction_i16( -; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %offset.idx = sub i16 %startval, {{.*}} -; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0 -; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4 - define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) { +; CHECK-LABEL: @reverse_induction_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[STARTVAL:%.*]], -1 +; CHECK-NEXT: [[MUL:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 1, i16 1023) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i16, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i16, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i16 [[TMP0]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i16 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i16 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 true, i1 [[TMP3]], i1 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[IND_END:%.*]] = sub i16 [[STARTVAL]], 1024 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[INDEX]] to i16 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i16 [[STARTVAL]], [[TMP8]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> undef, i16 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i16> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP9:%.*]] = add i16 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], -4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION5:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[INDUCTION6:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = add i16 [[TMP9]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = add i16 [[TMP10]], -1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i16 [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i16 [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 -4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 -3 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4 +; CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD8]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP24]] = add <4 x i32> [[REVERSE9]], [[VEC_PHI7]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <4 x i32> [[BIN_RDX10]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX10]], [[RDX_SHUF11]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[BIN_RDX12]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ], [ [[STARTVAL]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ADD_I7:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC4:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ADD_I]] = add i16 [[ADD_I7]], -1 +; CHECK-NEXT: [[KIND__I:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i16 [[ADD_I]] +; CHECK-NEXT: [[TMP_I1:%.*]] = load i32, i32* [[KIND__I]], align 4 +; CHECK-NEXT: [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]] +; CHECK-NEXT: [[INC4]] = add i32 [[I_06]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]], !llvm.loop !7 +; CHECK: loopend: +; CHECK-NEXT: [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[INC_REDUX_LCSSA]] +; entry: br label %for.body @@ -99,13 +315,72 @@ ; } ; } -; CHECK-LABEL: @reverse_forward_induction_i64_i8( -; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %offset.idx = sub i64 1023, %index -; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 -; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 - define void @reverse_forward_induction_i64_i8() { +; CHECK-LABEL: @reverse_forward_induction_i64_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4 +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STEP_ADD]], +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[REVERSE]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[REVERSE4]], <4 x i32>* [[TMP13]], align 4 +; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[OFFSET_IDX5]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP14]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION8:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], +; CHECK-NEXT: [[INDUCTION9:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], -4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[STEP_ADD]], +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[INC]] to i32 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP18]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END]], !llvm.loop !9 +; CHECK: while.end: +; CHECK-NEXT: ret void +; entry: br label %while.body @@ -125,13 +400,72 @@ ret void } -; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed( -; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %offset.idx = sub i64 1023, %index -; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 -; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 - define void @reverse_forward_induction_i64_i8_signed() { +; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4 +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STEP_ADD]], +; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i8> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[REVERSE]], <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[REVERSE4]], <4 x i32>* [[TMP13]], align 4 +; CHECK-NEXT: [[OFFSET_IDX5:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[OFFSET_IDX5]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP14]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION8:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], +; CHECK-NEXT: [[INDUCTION9:%.*]] = add <4 x i32> [[BROADCAST_SPLAT7]], +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], -4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[STEP_ADD]], +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8 [ -127, [[MIDDLE_BLOCK]] ], [ -127, [[ENTRY]] ] +; CHECK-NEXT: br label [[WHILE_BODY:%.*]] +; CHECK: while.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[WHILE_BODY]] ] +; CHECK-NEXT: [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[INC]] to i32 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP18]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END]], !llvm.loop !11 +; CHECK: while.end: +; CHECK-NEXT: ret void +; entry: br label %while.body