diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -597,10 +597,18 @@ dominates(Previous, SinkCandidate, VPDT)) return true; - if (SeenPrevious.contains(SinkCandidate) || - SinkCandidate->mayHaveSideEffects()) + if (SeenPrevious.contains(SinkCandidate)) return false; + if (SinkCandidate->mayHaveSideEffects()) { + if (!isa(SinkCandidate)) + return false; + if (any_of(make_range(std::next(SinkCandidate->getIterator()), + std::next(Previous->getIterator())), + [&](VPRecipeBase &R) { return R.mayReadOrWriteMemory(); })) + return false; + } + WorkList.push_back(SinkCandidate); return true; }; diff --git a/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll b/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll --- a/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s -define i32 @can_sink_load_that_uses_for_past_other_loads(ptr noalias %A, ptr noalias %B) { -; CHECK-LABEL: @can_sink_load_that_uses_for_past_other_loads( +define i32 @sink_load_that_uses_for_past_other_loads(ptr noalias %A, ptr noalias %B) { +; CHECK-LABEL: @sink_load_that_uses_for_past_other_loads( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: @@ -81,3 +81,205 @@ exit: ret void } + +define void @sink_store_that_uses_for_past_instructions(ptr noalias %A, ptr noalias %B) { +; CHECK-LABEL: @sink_store_that_uses_for_past_instructions( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1000, 1000 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] +; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[FOR_NEXT]] = add i32 [[IV]], 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 0, %entry ], [ %for.next, %loop ] + %gep.a = getelementptr inbounds i32, ptr %A, i32 %iv + store i32 %for, ptr %gep.a + %for.next = add i32 %iv, 2 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @sink_store_past_non_aliasing_load(ptr noalias %A, ptr noalias %B) { +; CHECK-LABEL: @sink_store_past_non_aliasing_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 [[FOR]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]] +; CHECK-NEXT: [[FOR_NEXT]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 0, %entry ], [ %for.next, %loop ] + %gep.a = getelementptr inbounds i32, ptr %A, i32 %iv + store i32 %for, ptr %gep.a + %gep.b = getelementptr inbounds i32, ptr %B, i32 %iv + %for.next = load i32, ptr %gep.b + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @sink_store_past_aliasing_load(ptr %A, ptr %B) { +; CHECK-LABEL: @sink_store_past_aliasing_load( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 [[FOR]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]] +; CHECK-NEXT: [[FOR_NEXT]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 0, %entry ], [ %for.next, %loop ] + %gep.a = getelementptr inbounds i32, ptr %A, i32 %iv + store i32 %for, ptr %gep.a + %gep.b = getelementptr inbounds i32, ptr %B, i32 %iv + %for.next = load i32, ptr %gep.b + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @sink_store_past_non_aliasing_store(ptr noalias %A, ptr noalias %B) { +; CHECK-LABEL: @sink_store_past_non_aliasing_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 [[FOR]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 123, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[FOR_NEXT]] = add i32 [[IV]], 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 0, %entry ], [ %for.next, %loop ] + %gep.a = getelementptr inbounds i32, ptr %A, i32 %iv + store i32 %for, ptr %gep.a + %gep.b = getelementptr inbounds i32, ptr %B, i32 %iv + store i32 123, ptr %gep.b + %for.next = add i32 %iv, 2 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @sink_store_past_aliasing_store(ptr %A, ptr %B) { +; CHECK-LABEL: @sink_store_past_aliasing_store( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 [[FOR]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]] +; CHECK-NEXT: store i32 123, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[FOR_NEXT]] = add i32 [[IV]], 2 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %for = phi i32 [ 0, %entry ], [ %for.next, %loop ] + %gep.a = getelementptr inbounds i32, ptr %A, i32 %iv + store i32 %for, ptr %gep.a + %gep.b = getelementptr inbounds i32, ptr %B, i32 %iv + store i32 123, ptr %gep.b + %for.next = add i32 %iv, 2 + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -3490,50 +3490,50 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]] -; CHECK-NEXT: [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST2]] +; CHECK-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP9]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP9]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; CHECK-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; CHECK-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; CHECK-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; CHECK-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3557,49 +3557,50 @@ ; IND-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; IND-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; IND-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; IND-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; IND-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; IND-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; IND-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; IND-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; IND-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; IND-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; IND-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] -; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; IND-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; IND-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 1 ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]] -; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 +; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST2]], [[T]] +; IND-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; IND-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; IND-NEXT: [[TMP9]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; IND-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP9]], <2 x i32> +; IND-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP7]], align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; IND-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; IND-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP9]], i64 1 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; IND-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; IND-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; IND-NEXT: [[TMP13:%.*]] = sext i8 [[IDX]] to i64 -; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; IND-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; IND-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; IND-NEXT: [[TMP12:%.*]] = sext i8 [[IDX]] to i64 +; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] +; IND-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; IND-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; IND-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; IND-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3623,52 +3624,55 @@ ; UNROLL-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; UNROLL-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; UNROLL-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; UNROLL-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; UNROLL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; UNROLL-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; UNROLL-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] -; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 1 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 -; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; UNROLL-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; UNROLL-NEXT: [[TMP11]] = zext <2 x i8> [[TMP9]] to <2 x i32> +; UNROLL-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP10]], <2 x i32> +; UNROLL-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> +; UNROLL-NEXT: store <2 x i32> [[TMP12]], ptr [[TMP7]], align 4 +; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP13]], ptr [[TMP14]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP11]], i64 1 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; UNROLL-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; UNROLL-NEXT: [[TMP16:%.*]] = sext i8 [[IDX]] to i64 +; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; UNROLL-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3695,55 +3699,58 @@ ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; UNROLL-NO-IC-NEXT: [[IND_END2:%.*]] = add i32 [[EXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0 -; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add i8 [[OFFSET_IDX]], 0 -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]] -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP13]] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP17]], align 4 +; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST3]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[OFFSET_IDX]], 2 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP7]] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i8> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP13]] = zext <2 x i8> [[TMP11]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP14]], ptr [[TMP16]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP15]], ptr [[TMP17]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NO-IC-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NO-IC-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NO-IC-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NO-IC-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3767,52 +3774,55 @@ ; INTERLEAVE-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[IND_END2:%.*]] = add i32 [[N_VEC]], [[EXT]] -; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0 -; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[T]], i64 0 +; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 3 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 -; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP9:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP11]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP10]], <4 x i32> +; INTERLEAVE-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; INTERLEAVE-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP7]], align 4 +; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP11]], i64 3 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[EXT]], [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT]], [[VECTOR_SCEVCHECK]] ], [ [[EXT]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; INTERLEAVE-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[SPHI:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; INTERLEAVE-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; INTERLEAVE-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[IDX_INC_EXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[TMP16:%.*]] = sext i8 [[IDX]] to i64 +; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP16]] +; INTERLEAVE-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; INTERLEAVE-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; INTERLEAVE-NEXT: [[IDX_INC_EXT]] = zext i8 [[IDX_INC]] to i32 ; INTERLEAVE-NEXT: [[IDX_B_INC]] = add nuw nsw i32 [[IDX_B]], 1 @@ -3871,51 +3881,51 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4 -; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST4]] -; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; CHECK-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST2]] +; CHECK-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; CHECK-NEXT: [[TMP10]] = mul <2 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; CHECK-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; CHECK-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; CHECK-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; CHECK-NEXT: [[MUL]] = mul i32 [[IDX_INC_EXT]], 4 @@ -3941,50 +3951,51 @@ ; IND-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; IND-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; IND-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; IND-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; IND-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; IND-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; IND-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; IND-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; IND-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; IND-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; IND-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; IND-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; IND-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; IND: vector.ph: ; IND-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -2 ; IND-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; IND-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; IND-NEXT: [[EXT_MUL5:%.*]] = add i32 [[N_VEC]], [[EXT]] -; IND-NEXT: [[IND_END2:%.*]] = shl i32 [[EXT_MUL5]], 2 -; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; IND-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; IND-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 1 ; IND-NEXT: br label [[VECTOR_BODY:%.*]] ; IND: vector.body: ; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; IND-NEXT: [[DOTCAST4:%.*]] = trunc i32 [[INDEX]] to i8 -; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST4]], [[T]] -; IND-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; IND-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; IND-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 +; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; IND-NEXT: [[DOTCAST2:%.*]] = trunc i32 [[INDEX]] to i8 +; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST2]], [[T]] +; IND-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; IND-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; IND-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; IND-NEXT: [[TMP10]] = shl nuw nsw <2 x i32> [[TMP9]], +; IND-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP10]], <2 x i32> +; IND-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP7]], align 4 ; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], +; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], ; IND-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; IND-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; IND: middle.block: ; IND-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; IND-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1 ; IND-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; IND: scalar.ph: -; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; IND-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; IND-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; IND-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; IND-NEXT: br label [[LOOP:%.*]] ; IND: loop: ; IND-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; IND-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; IND-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; IND-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; IND-NEXT: [[TMP13:%.*]] = sext i8 [[IDX]] to i64 ; IND-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; IND-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; IND-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; IND-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; IND-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; IND-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2 @@ -4010,53 +4021,57 @@ ; UNROLL-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; UNROLL-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; UNROLL-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; UNROLL-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; UNROLL-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; UNROLL-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; UNROLL-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; UNROLL-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; UNROLL-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; UNROLL-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -4 ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; UNROLL-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; UNROLL-NEXT: [[IND_END2:%.*]] = shl i32 [[EXT_MUL6]], 2 -; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], +; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 1 ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 -; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; UNROLL-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; UNROLL-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> +; UNROLL-NEXT: [[TMP12:%.*]] = shl nuw nsw <2 x i32> [[TMP10]], +; UNROLL-NEXT: [[TMP13]] = shl nuw nsw <2 x i32> [[TMP11]], +; UNROLL-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> +; UNROLL-NEXT: [[TMP15:%.*]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> +; UNROLL-NEXT: store <2 x i32> [[TMP14]], ptr [[TMP7]], align 4 +; UNROLL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP15]], ptr [[TMP16]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL: middle.block: ; UNROLL-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP13]], i64 1 ; UNROLL-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL: scalar.ph: -; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; UNROLL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; UNROLL-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; UNROLL-NEXT: br label [[LOOP:%.*]] ; UNROLL: loop: ; UNROLL-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; UNROLL-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; UNROLL-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NEXT: [[TMP18:%.*]] = sext i8 [[IDX]] to i64 +; UNROLL-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; UNROLL-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2 @@ -4085,56 +4100,60 @@ ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = trunc i32 [[LEN]] to i8 -; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[T]], [[TMP6]] -; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp ult i8 [[TMP7]], [[T]] -; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = icmp ugt i32 [[LEN]], 255 -; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]] -; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = or i1 [[TMP5]], [[TMP10]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[DOTCAST]] -; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4 -; UNROLL-NO-IC-NEXT: [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]] -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0 -; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i8> poison, i8 [[T]], i64 0 +; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT]], <2 x i8> poison, <2 x i32> zeroinitializer +; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 1 ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], -; UNROLL-NO-IC-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST5]] -; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0 -; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 2 -; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]] -; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP14]] -; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4 -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP18]], align 4 +; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[DOTCAST3]] +; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 0 +; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i8 [[OFFSET_IDX]], 2 +; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP6]] +; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP7]] +; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i8> [[VEC_IND]], +; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i8> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP11]] to <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = mul <2 x i32> [[TMP12]], +; UNROLL-NO-IC-NEXT: [[TMP15]] = mul <2 x i32> [[TMP13]], +; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP14]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP16]], ptr [[TMP18]], align 4 +; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], -; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], +; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 +; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-IC: scalar.ph: +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NO-IC-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; UNROLL-NO-IC-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[IDX]] -; UNROLL-NO-IC-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; UNROLL-NO-IC-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; UNROLL-NO-IC-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; UNROLL-NO-IC-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; UNROLL-NO-IC-NEXT: [[MUL]] = mul i32 [[IDX_INC_EXT]], 4 @@ -4160,53 +4179,57 @@ ; INTERLEAVE-NEXT: [[TMP1:%.*]] = trunc i32 [[LEN]] to i8 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], [[T]] ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp slt i8 [[TMP2]], [[T]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = trunc i32 [[LEN]] to i8 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = xor i8 [[T]], -1 -; INTERLEAVE-NEXT: [[TMP6:%.*]] = icmp ult i8 [[TMP5]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[LEN]], 255 -; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i1 [[TMP3]], [[TMP8]] -; INTERLEAVE-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LEN]], 255 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; INTERLEAVE-NEXT: br i1 [[TMP5]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[TMP0]], -8 ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i8 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[DOTCAST]], [[T]] -; INTERLEAVE-NEXT: [[EXT_MUL6:%.*]] = add i32 [[N_VEC]], [[EXT]] -; INTERLEAVE-NEXT: [[IND_END2:%.*]] = shl i32 [[EXT_MUL6]], 2 -; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0 -; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[T]], i64 0 +; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], +; INTERLEAVE-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 3 ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[INDEX]] to i8 -; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] -; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 -; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 -; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 -; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 +; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[DOTCAST3:%.*]] = trunc i32 [[INDEX]] to i8 +; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST3]], [[T]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = sext i8 [[OFFSET_IDX]] to i64 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP9:%.*]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> +; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], +; INTERLEAVE-NEXT: [[TMP13]] = shl nuw nsw <4 x i32> [[TMP11]], +; INTERLEAVE-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP12]], <4 x i32> +; INTERLEAVE-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> +; INTERLEAVE-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP7]], align 4 +; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP16]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], +; INTERLEAVE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] +; INTERLEAVE-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP13]], i64 3 ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INTERLEAVE: scalar.ph: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[T]], [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[EXT_MUL]], [[VECTOR_SCEVCHECK]] ], [ [[EXT_MUL]], [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[T]], [[VECTOR_SCEVCHECK]] ], [ [[T]], [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; INTERLEAVE-NEXT: br label [[LOOP:%.*]] ; INTERLEAVE: loop: ; INTERLEAVE-NEXT: [[IDX:%.*]] = phi i8 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[SPHI:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[MUL:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] ; INTERLEAVE-NEXT: [[IDX_B:%.*]] = phi i32 [ [[IDX_B_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[IDX]] to i64 -; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] -; INTERLEAVE-NEXT: store i32 [[SPHI]], ptr [[PTR]], align 4 +; INTERLEAVE-NEXT: [[TMP18:%.*]] = sext i8 [[IDX]] to i64 +; INTERLEAVE-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP18]] +; INTERLEAVE-NEXT: store i32 [[SCALAR_RECUR]], ptr [[PTR]], align 4 ; INTERLEAVE-NEXT: [[IDX_INC]] = add i8 [[IDX]], 1 ; INTERLEAVE-NEXT: [[IDX_INC_EXT:%.*]] = zext i8 [[IDX_INC]] to i32 ; INTERLEAVE-NEXT: [[MUL]] = shl nuw nsw i32 [[IDX_INC_EXT]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/pr33706.ll b/llvm/test/Transforms/LoopVectorize/pr33706.ll --- a/llvm/test/Transforms/LoopVectorize/pr33706.ll +++ b/llvm/test/Transforms/LoopVectorize/pr33706.ll @@ -1,12 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck %s @global = local_unnamed_addr global i32 0, align 4 @global.1 = local_unnamed_addr global i32 0, align 4 @global.2 = local_unnamed_addr global float 0x3EF0000000000000, align 4 -; CHECK-LABEL: @PR33706 -; CHECK-NOT: <2 x i32> define void @PR33706(ptr nocapture readonly %arg, ptr nocapture %arg1, i32 %arg2) local_unnamed_addr { +; CHECK-LABEL: @PR33706( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr @global.1, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[ARG:%.*]], i64 190 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[ARG1:%.*]], i64 512 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP]], 65535 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[ARG2:%.*]], 65536 +; CHECK-NEXT: br i1 [[TMP6]], label [[BB7:%.*]], label [[BB9:%.*]] +; CHECK: bb7: +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr @global, align 4 +; CHECK-NEXT: br label [[BB27:%.*]] +; CHECK: bb9: +; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 65536, [[ARG2]] +; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[TMP10]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[TMP4]], i64 [[TMP3]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[IND_END1:%.*]] = sub i32 [[TMP10]], [[DOTCAST]] +; CHECK-NEXT: [[DOTCAST3:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[DOTCAST3]], [[ARG2]] +; CHECK-NEXT: [[IND_END4:%.*]] = add i32 [[TMP]], [[TMP4]] +; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <2 x i32> poison, i32 [[ARG2]], i64 0 +; CHECK-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT6]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> , [[DOTSPLAT7]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[ARG2]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT8]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[ARG2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[TMP4]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[NEXT_GEP]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11]] = and <2 x i32> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP11]], <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = sitofp <2 x i32> [[TMP12]] to <2 x float> +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT9]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB22:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP5]], [[BB9]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP4]], [[BB9]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ [[TMP10]], [[BB9]] ] +; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[TMP]], [[BB9]] ] +; CHECK-NEXT: br label [[BB11:%.*]] +; CHECK: bb11: +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[TMP20:%.*]], [[BB11]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi ptr [ [[TMP18:%.*]], [[BB11]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[TMP16:%.*]], [[BB11]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ [[TMP19:%.*]], [[BB11]] ], [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP16]] = add nsw i32 [[TMP14]], -1 +; CHECK-NEXT: [[TMP17:%.*]] = sitofp i32 [[SCALAR_RECUR]] to float +; CHECK-NEXT: store float [[TMP17]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP18]] = getelementptr inbounds float, ptr [[TMP13]], i64 1 +; CHECK-NEXT: [[TMP19]] = add i32 [[TMP15]], [[ARG2]] +; CHECK-NEXT: [[TMP20]] = and i32 [[TMP19]], 65535 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[BB22]], label [[BB11]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: bb22: +; CHECK-NEXT: [[TMP23:%.*]] = phi ptr [ [[TMP18]], [[BB11]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP24:%.*]] = phi i32 [ [[TMP19]], [[BB11]] ], [ [[IND_END4]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP25:%.*]] = phi i32 [ [[TMP20]], [[BB11]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP26:%.*]] = ashr i32 [[TMP24]], 16 +; CHECK-NEXT: store i32 [[TMP26]], ptr @global, align 4 +; CHECK-NEXT: br label [[BB27]] +; CHECK: bb27: +; CHECK-NEXT: [[TMP28:%.*]] = phi i32 [ [[TMP26]], [[BB22]] ], [ [[TMP8]], [[BB7]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi ptr [ [[TMP23]], [[BB22]] ], [ [[TMP4]], [[BB7]] ] +; CHECK-NEXT: [[TMP30:%.*]] = phi i32 [ [[TMP25]], [[BB22]] ], [ [[TMP5]], [[BB7]] ] +; CHECK-NEXT: [[TMP31:%.*]] = sext i32 [[TMP28]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP30]] to float +; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr @global.2, align 4 +; CHECK-NEXT: [[TMP36:%.*]] = fmul float [[TMP35]], [[TMP34]] +; CHECK-NEXT: [[TMP37:%.*]] = fadd float [[TMP33]], [[TMP36]] +; CHECK-NEXT: store float [[TMP37]], ptr [[TMP29]], align 4 +; CHECK-NEXT: ret void +; bb: %tmp = load i32, ptr @global.1, align 4 %tmp3 = getelementptr inbounds float, ptr %arg, i64 190