diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -679,8 +679,14 @@ properlyDominates(Previous, SinkCandidate, VPDT)) return true; - if (SinkCandidate->mayHaveSideEffects()) - return false; + if (SinkCandidate->mayHaveSideEffects()) { + if (!isa(SinkCandidate)) + return false; + if (any_of(make_range(std::next(SinkCandidate->getIterator()), + std::next(Previous->getIterator())), + [&](VPRecipeBase &R) { return R.mayReadOrWriteMemory(); })) + return false; + } WorkList.push_back(SinkCandidate); return true; @@ -690,6 +696,8 @@ WorkList.push_back(FOR); for (unsigned I = 0; I != WorkList.size(); ++I) { VPRecipeBase *Current = WorkList[I]; + if (Current->getNumDefinedValues() == 0) + continue; assert(Current->getNumDefinedValues() == 1 && "only recipes with a single defined value expected"); for (VPUser *User : Current->getVPSingleValue()->users()) { diff --git a/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll b/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll --- a/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll @@ -85,16 +85,40 @@ define void @sink_store_that_uses_for_past_instructions(ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: @sink_store_that_uses_for_past_instructions( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]] -; CHECK-NEXT: store i32 [[FOR]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[GEP_A]], align 4 ; CHECK-NEXT: [[FOR_NEXT]] = add i32 [[IV]], 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -3490,50 +3490,49 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]] -; CHECK-NEXT: [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST2]] +; CHECK-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP9]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; CHECK-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; CHECK-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; CHECK-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; CHECK-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3557,49 +3556,50 @@ ; IND-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; IND-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; IND-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; IND-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; IND-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; IND-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; IND-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; IND-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; IND-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; IND-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; IND-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] -; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; IND-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; IND-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 1 ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]] -; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 +; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST2]], [[T]] +; IND-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; IND-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; IND-NEXT: [[TMP9]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; IND-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP9]], <2 x i32> +; IND-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP7]], align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; IND-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IND-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; IND-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; IND-NEXT: [[TMP13:%.*]] = sext i8 [[IDX]] to i64 -; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; IND-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; IND-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; IND-NEXT: [[TMP12:%.*]] = sext i8 [[IDX]] to i64 +; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; IND-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; IND-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; IND-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; IND-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3623,52 +3623,55 @@ ; UNROLL-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; UNROLL-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; UNROLL-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; UNROLL-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; UNROLL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; UNROLL-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; UNROLL-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] -; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 1 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 -; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; UNROLL-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; UNROLL-NEXT: [[TMP11]] = zext <2 x i8> [[TMP9]] to <2 x i32> +; UNROLL-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP10]], <2 x i32> +; UNROLL-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> +; UNROLL-NEXT: store <2 x i32> [[TMP12]], ptr [[TMP7]], align 4 +; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP13]], ptr [[TMP14]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP11]], i64 1 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; UNROLL-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NEXT: [[TMP16:%.*]] = sext i8 [[IDX]] to i64 +; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; UNROLL-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3695,55 +3698,57 @@ ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; UNROLL-NO-IC-NEXT: [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]] -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP13]] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP17]], align 4 +; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST3]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[OFFSET_IDX]], 2 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP7]] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i8> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP13]] = zext <2 x i8> [[TMP11]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP14]], ptr [[TMP16]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP15]], ptr [[TMP17]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NO-IC-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NO-IC-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NO-IC-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NO-IC-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3767,52 +3772,55 @@ ; INTERLEAVE-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] -; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0 -; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[T]], i64 0 +; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 3 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 -; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP9:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP11]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP10]], <4 x i32> +; INTERLEAVE-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; INTERLEAVE-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP7]], align 4 +; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP11]], i64 3 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; INTERLEAVE-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; INTERLEAVE-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; INTERLEAVE-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[TMP16:%.*]] = sext i8 [[IDX]] to i64 +; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; INTERLEAVE-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; INTERLEAVE-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; INTERLEAVE-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; INTERLEAVE-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3871,51 +3879,50 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]] -; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST2]] +; CHECK-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP10]] = mul <2 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; CHECK-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; CHECK-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; CHECK-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; CHECK-NEXT: [[MUL]] = mul i32 [[IDX_INC_EXT]], 4 @@ -3941,50 +3948,51 @@ ; IND-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; IND-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; IND-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; IND-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; IND-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; IND-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; IND-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; IND-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; IND-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; IND-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; IND-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]] -; IND-NEXT: [[IND_END2:%.*]] = shl i32 [[EXT_MUL5]], 2 -; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; IND-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; IND-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 1 ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]] -; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 +; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST2]], [[T]] +; IND-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; IND-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; IND-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; IND-NEXT: [[TMP10]] = shl nuw nsw <2 x i32> [[TMP9]], +; IND-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP10]], <2 x i32> +; IND-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP7]], align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], +; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], ; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; IND-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; IND-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; IND-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; IND-NEXT: [[TMP13:%.*]] = sext i8 [[IDX]] to i64 ; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; IND-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; IND-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; IND-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; IND-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; IND-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2 @@ -4010,53 +4018,57 @@ ; UNROLL-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; UNROLL-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; UNROLL-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; UNROLL-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; UNROLL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; UNROLL-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; UNROLL-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; UNROLL-NEXT: [[IND_END2:%.*]] = shl i32 [[EXT_MUL6]], 2 -; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 1 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 -; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; UNROLL-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; UNROLL-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> +; UNROLL-NEXT: [[TMP12:%.*]] = shl nuw nsw <2 x i32> [[TMP10]], +; UNROLL-NEXT: [[TMP13]] = shl nuw nsw <2 x i32> [[TMP11]], +; UNROLL-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> +; UNROLL-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> +; UNROLL-NEXT: store <2 x i32> [[TMP14]], ptr [[TMP7]], align 4 +; UNROLL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP15]], ptr [[TMP16]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP13]], i64 1 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; UNROLL-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NEXT: [[TMP18:%.*]] = sext i8 [[IDX]] to i64 +; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; UNROLL-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2 @@ -4085,56 +4097,59 @@ ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4 -; UNROLL-NO-IC-NEXT: [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]] -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP14]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP18]], align 4 +; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST3]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[OFFSET_IDX]], 2 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP7]] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i8> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP11]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP12]], +; UNROLL-NO-IC-NEXT: [[TMP15]] = mul <2 x i32> [[TMP13]], +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP14]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP16]], ptr [[TMP18]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NO-IC-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NO-IC-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NO-IC-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NO-IC-NEXT: [[MUL]] = mul i32 [[IDX_INC_EXT]], 4 @@ -4160,53 +4175,57 @@ ; INTERLEAVE-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; INTERLEAVE-NEXT: [[IND_END2:%.*]] = shl i32 [[EXT_MUL6]], 2 -; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0 -; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[T]], i64 0 +; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 3 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 -; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP9:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], +; INTERLEAVE-NEXT: [[TMP13]] = shl nuw nsw <4 x i32> [[TMP11]], +; INTERLEAVE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP12]], <4 x i32> +; INTERLEAVE-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> +; INTERLEAVE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP7]], align 4 +; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP16]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; INTERLEAVE-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; INTERLEAVE-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; INTERLEAVE-NEXT: [[TMP18:%.*]] = sext i8 [[IDX]] to i64 +; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; INTERLEAVE-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; INTERLEAVE-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; INTERLEAVE-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; INTERLEAVE-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/pr33706.ll b/llvm/test/Transforms/LoopVectorize/pr33706.ll --- a/llvm/test/Transforms/LoopVectorize/pr33706.ll +++ b/llvm/test/Transforms/LoopVectorize/pr33706.ll @@ -20,24 +20,80 @@ ; CHECK-NEXT: br label [[BB27:%.*]] ; CHECK: bb9: ; CHECK-NEXT: [[VAR_TMP10:%.*]] = udiv i32 65536, [[ARG2]] +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[VAR_TMP10]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[VAR_TMP4]], i64 [[TMP3]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END1:%.*]] = sub i32 [[VAR_TMP10]], [[DOTCAST]] +; CHECK-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[DOTCAST3]], [[ARG2]] +; CHECK-NEXT: [[IND_END4:%.*]] = add i32 [[TMP]], [[TMP4]] +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[VAR_TMP5]], i32 1 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <2 x i32> poison, i32 [[ARG2]], i64 0 +; CHECK-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT6]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> , [[DOTSPLAT7]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[ARG2]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT8]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[ARG2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[VAR_TMP4]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11]] = and <2 x i32> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP11]], <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = sitofp <2 x i32> [[TMP12]] to <2 x float> +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT9]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB22:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VAR_TMP5]], [[BB9]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[VAR_TMP4]], [[BB9]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[VAR_TMP10]], [[BB9]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[TMP]], [[BB9]] ] ; CHECK-NEXT: br label [[BB11:%.*]] ; CHECK: bb11: -; CHECK-NEXT: [[VAR_TMP12:%.*]] = phi i32 [ [[VAR_TMP20:%.*]], [[BB11]] ], [ [[VAR_TMP5]], [[BB9]] ] -; CHECK-NEXT: [[VAR_TMP13:%.*]] = phi ptr [ [[VAR_TMP18:%.*]], [[BB11]] ], [ [[VAR_TMP4]], [[BB9]] ] -; CHECK-NEXT: [[VAR_TMP14:%.*]] = phi i32 [ [[VAR_TMP16:%.*]], [[BB11]] ], [ [[VAR_TMP10]], [[BB9]] ] -; CHECK-NEXT: [[VAR_TMP15:%.*]] = phi i32 [ [[VAR_TMP19:%.*]], [[BB11]] ], [ [[TMP]], [[BB9]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[VAR_TMP20:%.*]], [[BB11]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR_TMP13:%.*]] = phi ptr [ [[VAR_TMP18:%.*]], [[BB11]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR_TMP14:%.*]] = phi i32 [ [[VAR_TMP16:%.*]], [[BB11]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[VAR_TMP15:%.*]] = phi i32 [ [[VAR_TMP19:%.*]], [[BB11]] ], [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[VAR_TMP16]] = add nsw i32 [[VAR_TMP14]], -1 -; CHECK-NEXT: [[VAR_TMP17:%.*]] = sitofp i32 [[VAR_TMP12]] to float +; CHECK-NEXT: [[VAR_TMP17:%.*]] = sitofp i32 [[SCALAR_RECUR]] to float ; CHECK-NEXT: store float [[VAR_TMP17]], ptr [[VAR_TMP13]], align 4 ; CHECK-NEXT: [[VAR_TMP18]] = getelementptr inbounds float, ptr [[VAR_TMP13]], i64 1 ; CHECK-NEXT: [[VAR_TMP19]] = add i32 [[VAR_TMP15]], [[ARG2]] ; CHECK-NEXT: [[VAR_TMP20]] = and i32 [[VAR_TMP19]], 65535 ; CHECK-NEXT: [[VAR_TMP21:%.*]] = icmp eq i32 [[VAR_TMP16]], 0 -; CHECK-NEXT: br i1 [[VAR_TMP21]], label [[BB22:%.*]], label [[BB11]] +; CHECK-NEXT: br i1 [[VAR_TMP21]], label [[BB22]], label [[BB11]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: bb22: -; CHECK-NEXT: [[VAR_TMP23:%.*]] = phi ptr [ [[VAR_TMP18]], [[BB11]] ] -; CHECK-NEXT: [[VAR_TMP24:%.*]] = phi i32 [ [[VAR_TMP19]], [[BB11]] ] -; CHECK-NEXT: [[VAR_TMP25:%.*]] = phi i32 [ [[VAR_TMP20]], [[BB11]] ] +; CHECK-NEXT: [[VAR_TMP23:%.*]] = phi ptr [ [[VAR_TMP18]], [[BB11]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[VAR_TMP24:%.*]] = phi i32 [ [[VAR_TMP19]], [[BB11]] ], [ [[IND_END4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[VAR_TMP25:%.*]] = phi i32 [ [[VAR_TMP20]], [[BB11]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[VAR_TMP26:%.*]] = ashr i32 [[VAR_TMP24]], 16 ; CHECK-NEXT: store i32 [[VAR_TMP26]], ptr @global, align 4 ; CHECK-NEXT: br label [[BB27]]