diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -975,6 +975,16 @@ } } + // Fold (add (zext (add X, -1)), 1) -> (zext X) if X is non-zero. + // TODO: There's a general form for any constant on the outer add. + if (C->isOne()) { + if (match(Op0, m_ZExt(m_Add(m_Value(X), m_AllOnes())))) { + const SimplifyQuery Q = SQ.getWithInstruction(&Add); + if (llvm::isKnownNonZero(X, DL, 0, Q.AC, Q.CxtI, Q.DT)) + return new ZExtInst(X, Ty); + } + } + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -2873,3 +2873,74 @@ %r = add i8 %zgt0, %signbit ret i8 %r } + +define i32 @dec_zext_add_assume_nonzero(i8 %x) { +; CHECK-LABEL: @dec_zext_add_assume_nonzero( +; CHECK-NEXT: [[Z:%.*]] = icmp ne i8 [[X:%.*]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[Z]]) +; CHECK-NEXT: [[C:%.*]] = zext i8 [[X]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; + %z = icmp ne i8 %x, 0 + call void @llvm.assume(i1 %z) + %a = add i8 %x, -1 + %b = zext i8 %a to i32 + %c = add i32 %b, 1 + ret i32 %c +} + +define i32 @dec_zext_add_nonzero(i8 %x) { +; CHECK-LABEL: @dec_zext_add_nonzero( +; CHECK-NEXT: [[O:%.*]] = or i8 [[X:%.*]], 4 +; CHECK-NEXT: [[C:%.*]] = zext i8 [[O]] to i32 +; CHECK-NEXT: ret i32 [[C]] +; + %o = or i8 %x, 4 + %a = add i8 %o, -1 + %b = zext i8 %a to i32 + %c = add i32 %b, 1 + ret i32 %c +} + +define <2 x i32> @dec_zext_add_nonzero_vec(<2 x i8> %x) { +; CHECK-LABEL: @dec_zext_add_nonzero_vec( +; CHECK-NEXT: [[O:%.*]] = or <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[C:%.*]] = zext <2 x i8> [[O]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[C]] +; + %o = or <2 x i8> %x, + %a = add <2 x i8> %o, + %b = zext <2 x i8> %a to <2 x i32> + %c = add <2 x i32> %b, + ret <2 x i32> %c +} + +define <2 x i32> @dec_zext_add_nonzero_vec_poison1(<2 x i8> %x) { +; CHECK-LABEL: @dec_zext_add_nonzero_vec_poison1( +; CHECK-NEXT: [[O:%.*]] = or <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[C:%.*]] = zext <2 x i8> [[O]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[C]] +; + %o = or <2 x i8> %x, + %a = add <2 x i8> %o, + %b = zext <2 x i8> %a to <2 x i32> + %c = add <2 x i32> %b, + ret <2 x i32> %c +} + +define <2 x i32> @dec_zext_add_nonzero_vec_poison2(<2 x i8> %x) { +; CHECK-LABEL: @dec_zext_add_nonzero_vec_poison2( +; CHECK-NEXT: [[O:%.*]] = or <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[A:%.*]] = add nsw <2 x i8> [[O]], +; CHECK-NEXT: [[B:%.*]] = zext <2 x i8> [[A]] to <2 x i32> +; CHECK-NEXT: [[C:%.*]] = add <2 x i32> [[B]], +; CHECK-NEXT: ret <2 x i32> [[C]] +; + %o = or <2 x i8> %x, + %a = add <2 x i8> %o, + %b = zext <2 x i8> %a to <2 x i32> + %c = add <2 x i32> %b, + ret <2 x i32> %c +} + +declare void @llvm.assume(i1) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -18,19 +18,17 @@ ; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]] ; CHECK: while.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: 
[[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -16 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967280 ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 @@ -38,23 +36,23 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP4]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]]) -; CHECK-NEXT: store <8 x i16> [[TMP8]], ptr [[NEXT_GEP6]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP6]], i64 8 -; CHECK-NEXT: store <8 x i16> [[TMP9]], ptr [[TMP10]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT10]]) +; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP6]], i64 8 +; CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: 
br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[BLOCKSIZE]], [[WHILE_BODY_PREHEADER]] ] @@ -66,13 +64,13 @@ ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP13:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP12]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP10]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, ptr [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i16 [[TMP13]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP11]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; @@ -110,16 +108,14 @@ ; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[ITER_CHECK:%.*]] ; CHECK: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[BLOCKSIZE]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -32 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0 @@ -130,30 +126,30 @@ ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 -; CHECK-NEXT: 
[[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]]) -; CHECK-NEXT: store <16 x i8> [[TMP4]], ptr [[NEXT_GEP3]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 16 -; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT7]]) +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 16 +; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END20:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END17:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] ; CHECK-NEXT: [[DOTCAST13:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[IND_END14:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST13]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[TMP2]], -8 +; CHECK-NEXT: [[N_VEC11:%.*]] = and i64 [[TMP0]], 4294967288 ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC11]] to i32 ; CHECK-NEXT: [[IND_END12:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] ; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC11]] @@ -166,13 +162,13 @@ ; CHECK-NEXT: [[NEXT_GEP24:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX23]] ; CHECK-NEXT: [[NEXT_GEP25:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX23]] ; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <8 x i8>, ptr [[NEXT_GEP24]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD26]], <8 x i8> [[BROADCAST_SPLAT28]]) -; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr [[NEXT_GEP25]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD26]], <8 x i8> [[BROADCAST_SPLAT28]]) +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[NEXT_GEP25]], align 2 ; CHECK-NEXT: [[INDEX_NEXT29]] = add nuw i64 
[[INDEX23]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC11]] -; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC11]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC11]] +; CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[N_VEC11]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N22]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i32 [ [[IND_END12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] @@ -184,10 +180,10 @@ ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL21]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP10]], i8 [[OFFSET]]) +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP8]], i8 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i8 [[TMP11]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i8 [[TMP9]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -19,16 +19,14 @@ ; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[ITER_CHECK:%.*]] ; CHECK: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[BLOCKSIZE]], 64 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -64 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967232 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <16 x i16> poison, i16 [[OFFSET]], i64 0 @@ -40,70 
+38,70 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 32 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP6]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 48 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i16>, ptr [[TMP7]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT13]]) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT15]]) -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT17]]) -; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[NEXT_GEP5]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 16 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 32 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 48 +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT13]]) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT15]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT17]]) +; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 16 +; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 32 +; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 48 ; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP12]], align 2 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: store <16 x i16> [[TMP10]], ptr [[TMP13]], align 2 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr 
[[NEXT_GEP5]], i64 48 -; CHECK-NEXT: store <16 x i16> [[TMP11]], ptr [[TMP14]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END30:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END30:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP15]] ; CHECK-NEXT: [[DOTCAST23:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[IND_END24:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST23]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 56 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 56 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC21:%.*]] = and i64 [[TMP2]], -8 +; CHECK-NEXT: [[N_VEC21:%.*]] = and i64 [[TMP0]], 4294967288 ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC21]] to i32 ; CHECK-NEXT: [[IND_END22:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] -; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[N_VEC21]], 1 -; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[N_VEC21]], 1 -; CHECK-NEXT: [[IND_END29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[N_VEC21]], 1 +; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[N_VEC21]], 1 +; CHECK-NEXT: [[IND_END29:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP17]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT37]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX33:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT39:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[INDEX33]], 1 -; CHECK-NEXT: [[NEXT_GEP34:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = shl i64 [[INDEX33]], 1 -; CHECK-NEXT: [[NEXT_GEP35:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[INDEX33]], 
1 +; CHECK-NEXT: [[NEXT_GEP34:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[INDEX33]], 1 +; CHECK-NEXT: [[NEXT_GEP35:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP19]] ; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <8 x i16>, ptr [[NEXT_GEP34]], align 2 -; CHECK-NEXT: [[TMP22:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD36]], <8 x i16> [[BROADCAST_SPLAT38]]) -; CHECK-NEXT: store <8 x i16> [[TMP22]], ptr [[NEXT_GEP35]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD36]], <8 x i16> [[BROADCAST_SPLAT38]]) +; CHECK-NEXT: store <8 x i16> [[TMP20]], ptr [[NEXT_GEP35]], align 2 ; CHECK-NEXT: [[INDEX_NEXT39]] = add nuw i64 [[INDEX33]], 8 -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC21]] -; CHECK-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC21]] +; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N32:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC21]] +; CHECK-NEXT: [[CMP_N32:%.*]] = icmp eq i64 [[N_VEC21]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N32]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi i32 [ [[IND_END22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] @@ -115,10 +113,10 @@ ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL28]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL31]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP24:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP25:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP24]], i16 [[OFFSET]]) +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP22]], i16 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, ptr [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i16 [[TMP25]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i16 [[TMP23]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -160,16 +158,14 @@ ; CHECK-NEXT: [[CMP_NOT6:%.*]] = icmp eq i32 [[BLOCKSIZE:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT6]], label [[WHILE_END:%.*]], label [[ITER_CHECK:%.*]] ; CHECK: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BLOCKSIZE]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[BLOCKSIZE]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[BLOCKSIZE]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[BLOCKSIZE]], 128 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label 
[[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -128 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967168 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i8> poison, i8 [[OFFSET:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i8> [[BROADCAST_SPLATINSERT]], <32 x i8> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <32 x i8> poison, i8 [[OFFSET]], i64 0 @@ -184,40 +180,40 @@ ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP4]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 -; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <32 x i8>, ptr [[TMP5]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD]], <32 x i8> [[WIDE_LOAD]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT13]]) -; CHECK-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT15]]) -; CHECK-NEXT: [[TMP9:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT17]]) -; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[NEXT_GEP5]], align 2 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD]], <32 x i8> [[WIDE_LOAD]], <32 x i8> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT13]]) +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT15]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT17]]) +; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 +; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 +; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP9]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 ; CHECK-NEXT: store <32 x i8> [[TMP7]], ptr [[TMP10]], align 2 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 -; 
CHECK-NEXT: store <32 x i8> [[TMP8]], ptr [[TMP11]], align 2 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 -; CHECK-NEXT: store <32 x i8> [[TMP9]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END30:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] ; CHECK-NEXT: [[DOTCAST23:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[IND_END24:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST23]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 112 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 112 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC21:%.*]] = and i64 [[TMP2]], -16 +; CHECK-NEXT: [[N_VEC21:%.*]] = and i64 [[TMP0]], 4294967280 ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC21]] to i32 ; CHECK-NEXT: [[IND_END22:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST]] ; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC21]] @@ -230,13 +226,13 @@ ; CHECK-NEXT: [[NEXT_GEP34:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX33]] ; CHECK-NEXT: [[NEXT_GEP35:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX33]] ; CHECK-NEXT: [[WIDE_LOAD36:%.*]] = load <16 x i8>, ptr [[NEXT_GEP34]], align 2 -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD36]], <16 x i8> [[WIDE_LOAD36]], <16 x i8> [[BROADCAST_SPLAT38]]) -; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[NEXT_GEP35]], align 2 +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD36]], <16 x i8> [[WIDE_LOAD36]], <16 x i8> [[BROADCAST_SPLAT38]]) +; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[NEXT_GEP35]], align 2 ; CHECK-NEXT: [[INDEX_NEXT39]] = add nuw i64 [[INDEX33]], 16 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC21]] -; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC21]] +; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N32:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC21]] +; CHECK-NEXT: [[CMP_N32:%.*]] = icmp eq i64 [[N_VEC21]], [[TMP0]] ; CHECK-NEXT: br i1 [[CMP_N32]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi i32 [ [[IND_END22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 
[[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ] @@ -248,10 +244,10 @@ ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL28]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL31]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 -; CHECK-NEXT: [[TMP17:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP16]], i8 [[TMP16]], i8 [[OFFSET]]) +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[PSRC_ADDR_08]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP14]], i8 [[TMP14]], i8 [[OFFSET]]) ; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 1 -; CHECK-NEXT: store i8 [[TMP17]], ptr [[PDST_ADDR_07]], align 2 +; CHECK-NEXT: store i8 [[TMP15]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -46,7 +46,7 @@ ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP9:%.*]] ; CHECK: 9: -; CHECK-NEXT: br i1 poison, label [[TMP10]], label [[TMP9]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[TMP10]], label [[TMP9]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: 10: ; CHECK-NEXT: ret void ; @@ -78,52 +78,52 @@ ; CHECK: .lr.ph5.preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP2]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[TMP2]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], 
[2048 x i32]* @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP6]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP5]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i64 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] ; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP8]] -; CHECK-NEXT: store i32 [[X]], i32* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP8]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.continue2: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i64 2 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP11]] -; CHECK-NEXT: store i32 [[X]], i32* [[TMP12]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP10]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP11]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP4]], i64 3 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP14]] -; CHECK-NEXT: store i32 [[X]], i32* [[TMP15]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP13]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP14]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -133,16 +133,16 @@ ; 
CHECK-NEXT: br label [[DOTPREHEADER]] ; CHECK: .preheader: ; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i64 [ [[PHITMP]], [[DOT_PREHEADER_CRIT_EDGE]] ], [ 0, [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[N]], 0 -; CHECK-NEXT: br i1 [[TMP17]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[N]], 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]] ; CHECK: .lr.ph.preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH8:%.*]], label [[VECTOR_PH9:%.*]] ; CHECK: vector.ph9: -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[N_RND_UP10:%.*]] = add nuw nsw i64 [[TMP19]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[N_RND_UP10:%.*]] = add nuw nsw i64 [[TMP17]], 3 ; CHECK-NEXT: [[N_VEC12:%.*]] = and i64 [[N_RND_UP10]], 8589934588 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TMP19]], i64 0 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_116:%.*]] = add nsw i64 [[TMP17]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY19:%.*]] ; CHECK: vector.body19: @@ -151,61 +151,61 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT21]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT22]], -; CHECK-NEXT: [[TMP20:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT18]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP20]], i64 0 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT18]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; CHECK: pred.store.if23: -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP24]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], [[TMP23]] -; CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP27]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]] +; CHECK-NEXT: store i32 [[TMP25]], i32* [[TMP24]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] ; CHECK: pred.store.continue24: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> 
[[TMP20]], i64 1 -; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1 +; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; CHECK: pred.store.if25: -; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP29]] +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP28]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP27]] ; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[TMP32]], align 4 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP29]] -; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], [[TMP31]] -; CHECK-NEXT: store i32 [[TMP34]], i32* [[TMP35]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP27]] +; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]] +; CHECK-NEXT: store i32 [[TMP33]], i32* [[TMP32]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] ; CHECK: pred.store.continue26: -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP20]], i64 2 -; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2 +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; CHECK: pred.store.if27: -; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP37]] +; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP36]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP35]] ; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP38]], align 4 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP37]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP37]] -; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], [[TMP39]] -; CHECK-NEXT: store i32 [[TMP42]], i32* [[TMP43]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP35]] +; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]] +; CHECK-NEXT: store i32 [[TMP41]], i32* [[TMP40]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] ; CHECK: pred.store.continue28: -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP20]], i64 3 -; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3 +; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_STORE_IF29:%.*]], label 
[[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.if29: -; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 3 -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP45]] +; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP44]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP43]] ; CHECK-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP46]], align 4 -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP45]] -; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[TMP48]], align 4 -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP45]] -; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP49]], [[TMP47]] -; CHECK-NEXT: store i32 [[TMP50]], i32* [[TMP51]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP43]] +; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]] +; CHECK-NEXT: store i32 [[TMP49]], i32* [[TMP48]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.continue30: ; CHECK-NEXT: [[INDEX_NEXT31]] = add i64 [[INDEX20]], 4 -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC12]] -; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY19]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC12]] +; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY19]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block7: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]] ; CHECK: scalar.ph8: @@ -268,11 +268,11 @@ ; CHECK: .lr.ph.preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP2]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[TMP2]], -1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -280,52 +280,52 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT12]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT13]], -; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement 
<4 x i1> [[TMP3]], i64 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[Q:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[NEXT_GEP8]], align 16 -; CHECK-NEXT: store i32 [[TMP6]], i32* [[NEXT_GEP]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[NEXT_GEP8]], align 16 +; CHECK-NEXT: store i32 [[TMP5]], i32* [[NEXT_GEP]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i64 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] ; CHECK: pred.store.if14: +; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP9]], align 16 -; CHECK-NEXT: store i32 [[TMP10]], i32* [[NEXT_GEP5]], align 16 +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[NEXT_GEP9]], align 16 +; CHECK-NEXT: store i32 [[TMP9]], i32* [[NEXT_GEP5]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] ; CHECK: pred.store.continue15: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i64 2 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] ; CHECK: pred.store.if16: +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16 -; CHECK-NEXT: store i32 [[TMP14]], i32* [[NEXT_GEP6]], align 16 +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16 +; CHECK-NEXT: store i32 [[TMP13]], i32* [[NEXT_GEP6]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] ; CHECK: pred.store.continue17: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP4]], i64 3 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19]] ; CHECK: pred.store.if18: +; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 
3 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16 -; CHECK-NEXT: store i32 [[TMP18]], i32* [[NEXT_GEP7]], align 16 +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16 +; CHECK-NEXT: store i32 [[TMP17]], i32* [[NEXT_GEP7]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] ; CHECK: pred.store.continue19: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -24,37 +24,35 @@ ; VEC4_INTERL1-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4 -; VEC4_INTERL1-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], [[CAST_VTC]] -; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP3]] +; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967292 +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] +; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], +; 
VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP6]], align 4 +; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT5]] -; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC4_INTERL1: middle.block: -; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL1: scalar.ph: ; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -69,7 +67,7 @@ ; VEC4_INTERL1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL1-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC4_INTERL1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; VEC4_INTERL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; VEC4_INTERL1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC4_INTERL1: for.end.loopexit: ; VEC4_INTERL1-NEXT: br label [[FOR_END]] ; VEC4_INTERL1: for.end: @@ -81,40 +79,38 @@ ; VEC4_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL2: for.body.lr.ph: ; VEC4_INTERL2-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -8 -; VEC4_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], [[CAST_VTC]] -; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP3]] +; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = 
and i64 [[TMP0]], 4294967288 +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] +; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], 4.000000e+00 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT5]] -; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP6]], align 4 -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 -; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP8]], align 4 +; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT5]] -; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC4_INTERL2: middle.block: -; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL2: scalar.ph: ; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, 
[[FOR_BODY_LR_PH]] ] @@ -129,7 +125,7 @@ ; VEC4_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC4_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC4_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; VEC4_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; VEC4_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC4_INTERL2: for.end.loopexit: ; VEC4_INTERL2-NEXT: br label [[FOR_END]] ; VEC4_INTERL2: for.end: @@ -141,33 +137,31 @@ ; VEC1_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC1_INTERL2: for.body.lr.ph: ; VEC1_INTERL2-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC1_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], [[CAST_VTC]] -; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP3]] +; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] +; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[FPINC]], [[TMP4]] -; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fsub fast float [[INIT]], [[TMP5]] -; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fsub fast float [[OFFSET_IDX]], [[FPINC]] -; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 -; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDUCTION2]] -; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP7]], align 4 -; VEC1_INTERL2-NEXT: store float [[TMP6]], ptr [[TMP8]], align 4 +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[FPINC]], [[DOTCAST2]] +; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fsub fast float [[INIT]], [[TMP2]] +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fsub fast float [[OFFSET_IDX]], [[FPINC]] +; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP5]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP3]], ptr [[TMP6]], align 4 ; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; 
VEC1_INTERL2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC1_INTERL2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC1_INTERL2-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC1_INTERL2: middle.block: -; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC1_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC1_INTERL2: scalar.ph: ; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -182,7 +176,7 @@ ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] ; VEC1_INTERL2: for.end: @@ -194,37 +188,35 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC2_INTERL1_PRED_STORE: for.body.lr.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: -; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], [[CAST_VTC]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP3]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT2]], <2 x float> poison, <2 x i32> zeroinitializer -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[DOTSPLAT3]], -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fsub fast <2 x float> 
[[DOTSPLAT]], [[TMP4]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = fmul fast float [[FPINC]], 2.000000e+00 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[DOTSPLAT3]], +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fsub fast <2 x float> [[DOTSPLAT]], [[TMP2]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[FPINC]], 2.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT4]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT]] = fsub fast <2 x float> [[VEC_IND]], [[DOTSPLAT5]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: middle.block: -; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] ; VEC2_INTERL1_PRED_STORE: for.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -235,7 +227,7 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC2_INTERL1_PRED_STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: for.end: ; VEC2_INTERL1_PRED_STORE-NEXT: ret void ; @@ -279,37 +271,35 @@ ; VEC4_INTERL1-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; 
VEC4_INTERL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4 -; VEC4_INTERL1-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_VTC]] -; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] +; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967292 +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] +; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], +; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP6]], align 4 +; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <4 x float> [[VEC_IND]], [[DOTSPLAT5]] -; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VEC4_INTERL1: middle.block: -; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp 
eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL1: scalar.ph: ; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -336,40 +326,38 @@ ; VEC4_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL2: for.body.lr.ph: ; VEC4_INTERL2-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -8 -; VEC4_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_VTC]] -; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] +; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967288 +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] +; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP4]] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], 4.000000e+00 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT4]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub reassoc <4 x float> [[VEC_IND]], [[DOTSPLAT5]] -; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP6]], align 4 -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 -; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP8]], align 4 +; 
VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <4 x float> [[STEP_ADD]], [[DOTSPLAT5]] -; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VEC4_INTERL2: middle.block: -; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL2: scalar.ph: ; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -396,35 +384,33 @@ ; VEC1_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC1_INTERL2: for.body.lr.ph: ; VEC1_INTERL2-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC1_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_VTC]] -; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] +; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] +; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], [[TMP4]] -; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fsub reassoc float [[INIT]], [[TMP5]] -; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fmul reassoc float [[FPINC]], 0.000000e+00 -; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fsub reassoc float [[OFFSET_IDX]], [[TMP6]] -; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = fsub reassoc float [[OFFSET_IDX]], [[FPINC]] -; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 -; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDUCTION2]] -; VEC1_INTERL2-NEXT: store float [[TMP7]], ptr [[TMP9]], 
align 4 -; VEC1_INTERL2-NEXT: store float [[TMP8]], ptr [[TMP10]], align 4 +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST2]] +; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fsub reassoc float [[INIT]], [[TMP2]] +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], 0.000000e+00 +; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = fsub reassoc float [[OFFSET_IDX]], [[TMP3]] +; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fsub reassoc float [[OFFSET_IDX]], [[FPINC]] +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]] +; VEC1_INTERL2-NEXT: store float [[TMP4]], ptr [[TMP7]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP5]], ptr [[TMP8]], align 4 ; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC1_INTERL2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC1_INTERL2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC1_INTERL2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VEC1_INTERL2: middle.block: -; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC1_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC1_INTERL2: scalar.ph: ; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -439,7 +425,7 @@ ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] ; VEC1_INTERL2: for.end: @@ -451,37 +437,35 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC2_INTERL1_PRED_STORE: for.body.lr.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[FPINC:%.*]] = load float, ptr @fp_inc, align 4 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: -; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_VTC]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] +; 
VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <2 x float> poison, float [[FPINC]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT2]], <2 x float> poison, <2 x i32> zeroinitializer -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fmul reassoc <2 x float> [[DOTSPLAT3]], -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fsub reassoc <2 x float> [[DOTSPLAT]], [[TMP4]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], 2.000000e+00 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = fmul reassoc <2 x float> [[DOTSPLAT3]], +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fsub reassoc <2 x float> [[DOTSPLAT]], [[TMP2]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], 2.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT4]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP6]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <2 x float> [[VEC_IND]], [[DOTSPLAT5]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: middle.block: -; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] ; VEC2_INTERL1_PRED_STORE: for.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 
[[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -537,16 +521,14 @@ ; VEC4_INTERL1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC4_INTERL1-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL1: for.body.preheader: -; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4 -; VEC4_INTERL1-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967292 +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], @@ -554,14 +536,14 @@ ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 +; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VEC4_INTERL1: middle.block: -; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL1: scalar.ph: ; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -587,16 +569,14 @@ ; VEC4_INTERL2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC4_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL2: for.body.preheader: -; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC4_INTERL2-NEXT: 
[[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -8 -; VEC4_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967288 +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], @@ -605,16 +585,16 @@ ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 -; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 -; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP6]], align 4 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL2-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VEC4_INTERL2: middle.block: -; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL2: scalar.ph: ; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -640,33 +620,31 @@ ; VEC1_INTERL2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC1_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC1_INTERL2: for.body.preheader: -; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC1_INTERL2-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC1_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 5.000000e-01 -; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[TMP5]], [[INIT]] -; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fadd fast float [[OFFSET_IDX]], 5.000000e-01 -; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 -; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDUCTION2]] -; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP7]], align 4 -; VEC1_INTERL2-NEXT: store float [[TMP6]], ptr [[TMP8]], align 4 +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST2]], 5.000000e-01 +; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[TMP2]], [[INIT]] +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fadd fast float [[OFFSET_IDX]], 5.000000e-01 +; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP5]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP3]], ptr [[TMP6]], align 4 ; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC1_INTERL2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC1_INTERL2-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VEC1_INTERL2: middle.block: -; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC1_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC1_INTERL2: scalar.ph: ; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -681,7 +659,7 @@ ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 
[[LFTR_WIDEIV]], [[N]] -; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] ; VEC1_INTERL2: for.end: @@ -692,16 +670,14 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC2_INTERL1_PRED_STORE: for.body.preheader: -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: -; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], @@ -709,14 +685,14 @@ ; VEC2_INTERL1_PRED_STORE: vector.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP4]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: 
middle.block: -; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] ; VEC2_INTERL1_PRED_STORE: for.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -777,27 +753,25 @@ ; VEC4_INTERL1-NEXT: br i1 [[CMP9]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL1: for.body.lr.ph: ; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = load float, ptr @fp_inc, align 4 -; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], -4 -; VEC4_INTERL1-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = fmul fast float [[CAST_VTC]], -5.000000e-01 -; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP4]], 0x3FB99999A0000000 -; VEC4_INTERL1-NEXT: [[CAST_VTC2:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], [[CAST_VTC2]] -; VEC4_INTERL1-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP5]], [[INIT:%.*]] +; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 4294967292 +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 +; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 +; VEC4_INTERL1-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] +; VEC4_INTERL1-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], -; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP6]] -; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 -; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT7:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], +; VEC4_INTERL1-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]] +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 +; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT7:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 ; VEC4_INTERL1-NEXT: [[DOTSPLAT8:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT7]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float 
[[TMP0]], i64 0 ; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer @@ -806,22 +780,22 @@ ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL1-NEXT: [[VEC_IND9:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND9]], ptr [[TMP8]], align 4 -; VEC4_INTERL1-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND9]], [[BROADCAST_SPLAT]] -; VEC4_INTERL1-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP11]], [[TMP10]] -; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store <4 x float> [[TMP12]], ptr [[TMP13]], align 4 -; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store <4 x float> [[TMP11]], ptr [[TMP15]], align 4 +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND9]], ptr [[TMP6]], align 4 +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[VEC_IND9]], [[BROADCAST_SPLAT]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[TMP8]], [[TMP7]] +; VEC4_INTERL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP9]], ptr [[TMP10]], align 4 +; VEC4_INTERL1-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> [[TMP8]], ptr [[TMP11]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT10]] = fadd fast <4 x float> [[VEC_IND9]], [[DOTSPLAT8]] -; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL1-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VEC4_INTERL1: middle.block: -; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP1]] ; VEC4_INTERL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL1: scalar.ph: ; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -856,27 +830,25 @@ ; VEC4_INTERL2-NEXT: br i1 [[CMP9]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL2: for.body.lr.ph: ; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = load float, ptr @fp_inc, align 4 -; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL2-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], -8 -; VEC4_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast float [[CAST_VTC]], -5.000000e-01 -; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP4]], 0x3FB99999A0000000 -; VEC4_INTERL2-NEXT: [[CAST_VTC2:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], [[CAST_VTC2]] -; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP5]], [[INIT:%.*]] +; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 4294967288 +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 +; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 +; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] +; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP6]] -; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i64 0 +; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]] +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], 4.000000e+00 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer @@ -888,31 +860,31 @@ ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP8]], align 4 -; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 4 -; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP10]], align 4 -; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[VEC_IND10]], 
[[BROADCAST_SPLAT]] -; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT15]] -; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[TMP14]], [[TMP12]] -; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP13]] -; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[TMP16]], ptr [[TMP18]], align 4 -; VEC4_INTERL2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP18]], i64 4 -; VEC4_INTERL2-NEXT: store <4 x float> [[TMP17]], ptr [[TMP20]], align 4 -; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[TMP14]], ptr [[TMP22]], align 4 -; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 4 -; VEC4_INTERL2-NEXT: store <4 x float> [[TMP15]], ptr [[TMP24]], align 4 +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4 +; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT15]] +; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], +; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] +; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[TMP9]] +; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store <4 x float> [[TMP12]], ptr [[TMP14]], align 4 +; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[TMP13]], ptr [[TMP15]], align 4 +; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4 +; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT13]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT9]] -; VEC4_INTERL2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL2-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VEC4_INTERL2: middle.block: -; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP1]] ; VEC4_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL2: scalar.ph: ; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -947,52 +919,50 @@ ; VEC1_INTERL2-NEXT: br i1 [[CMP9]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC1_INTERL2: for.body.lr.ph: ; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = load float, ptr @fp_inc, align 4 -; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 -; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP1]], 0 +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64 +; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], -2 -; VEC1_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast float [[CAST_VTC]], -5.000000e-01 -; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP4]], 0x3FB99999A0000000 -; VEC1_INTERL2-NEXT: [[CAST_VTC2:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], [[CAST_VTC2]] -; VEC1_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP5]], [[INIT:%.*]] +; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 4294967294 +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 +; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] +; VEC1_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast float [[TMP0]], [[TMP6]] -; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[TMP7]], [[INIT]] -; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP0]] -; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], -5.000000e-01 -; VEC1_INTERL2-NEXT: [[INDUCTION6:%.*]] = or i64 [[INDEX]], 1 -; VEC1_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDUCTION6]] -; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP11]], align 4 -; VEC1_INTERL2-NEXT: store float [[TMP8]], ptr [[TMP12]], align 4 -; VEC1_INTERL2-NEXT: [[TMP13:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP0]] -; VEC1_INTERL2-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP8]], [[TMP0]] -; VEC1_INTERL2-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP10]], 0xBFD99999A0000000 -; VEC1_INTERL2-NEXT: [[TMP16:%.*]] = fadd fast float [[TMP10]], 0xBFECCCCCC0000000 -; VEC1_INTERL2-NEXT: [[TMP17:%.*]] = fadd fast float [[TMP15]], [[TMP13]] -; VEC1_INTERL2-NEXT: [[TMP18:%.*]] = fadd fast float [[TMP16]], [[TMP14]] -; VEC1_INTERL2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDUCTION6]] -; VEC1_INTERL2-NEXT: store 
float [[TMP17]], ptr [[TMP19]], align 4 -; VEC1_INTERL2-NEXT: store float [[TMP18]], ptr [[TMP20]], align 4 -; VEC1_INTERL2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDUCTION6]] -; VEC1_INTERL2-NEXT: store float [[TMP15]], ptr [[TMP21]], align 4 -; VEC1_INTERL2-NEXT: store float [[TMP16]], ptr [[TMP22]], align 4 +; VEC1_INTERL2-NEXT: [[DOTCAST5:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast float [[TMP0]], [[DOTCAST5]] +; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[TMP4]], [[INIT]] +; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP0]] +; VEC1_INTERL2-NEXT: [[DOTCAST6:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float [[DOTCAST6]], -5.000000e-01 +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP8]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP5]], ptr [[TMP9]], align 4 +; VEC1_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast float [[OFFSET_IDX]], [[TMP0]] +; VEC1_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast float [[TMP5]], [[TMP0]] +; VEC1_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP6]], 0xBFD99999A0000000 +; VEC1_INTERL2-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP6]], 0xBFECCCCCC0000000 +; VEC1_INTERL2-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP12]], [[TMP10]] +; VEC1_INTERL2-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP13]], [[TMP11]] +; VEC1_INTERL2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] +; VEC1_INTERL2-NEXT: store float [[TMP14]], ptr [[TMP16]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP15]], ptr [[TMP17]], align 4 +; VEC1_INTERL2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP7]] +; VEC1_INTERL2-NEXT: store float [[TMP12]], ptr [[TMP18]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP13]], ptr [[TMP19]], align 4 ; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC1_INTERL2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC1_INTERL2-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VEC1_INTERL2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC1_INTERL2-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VEC1_INTERL2: middle.block: -; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP1]] ; VEC1_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC1_INTERL2: scalar.ph: ; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -1015,7 +985,7 @@ ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label 
[[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] ; VEC1_INTERL2: for.end: @@ -1027,27 +997,25 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP9]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC2_INTERL1_PRED_STORE: for.body.lr.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = load float, ptr @fp_inc, align 4 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP1]], 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64 +; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: -; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], -2 -; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fmul fast float [[CAST_VTC]], -5.000000e-01 -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP4]], 0x3FB99999A0000000 -; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_VTC2:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], [[CAST_VTC2]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP5]], [[INIT:%.*]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 4294967294 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT5]], <2 x float> poison, <2 x i32> zeroinitializer -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[DOTSPLAT6]], -; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP6]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP7:%.*]] = fmul fast float [[TMP0]], 2.000000e+00 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i64 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fmul fast <2 x float> [[DOTSPLAT6]], +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDUCTION:%.*]] = fadd fast <2 x float> [[DOTSPLAT]], [[TMP4]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP0]], 2.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: 
[[DOTSPLATINSERT7:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLAT8:%.*]] = shufflevector <2 x float> [[DOTSPLATINSERT7]], <2 x float> poison, <2 x i32> zeroinitializer ; VEC2_INTERL1_PRED_STORE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 ; VEC2_INTERL1_PRED_STORE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer @@ -1056,22 +1024,22 @@ ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND9:%.*]] = phi <2 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VECTOR_BODY]] ] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND9]], ptr [[TMP8]], align 4 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = fadd fast <2 x float> [[VEC_IND9]], [[BROADCAST_SPLAT]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = fadd fast <2 x float> [[VEC_IND]], -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP12:%.*]] = fadd fast <2 x float> [[TMP11]], [[TMP10]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[TMP12]], ptr [[TMP13]], align 4 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[TMP11]], ptr [[TMP15]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND9]], ptr [[TMP6]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP7:%.*]] = fadd fast <2 x float> [[VEC_IND9]], [[BROADCAST_SPLAT]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = fadd fast <2 x float> [[VEC_IND]], +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = fadd fast <2 x float> [[TMP8]], [[TMP7]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[TMP9]], ptr [[TMP10]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[TMP8]], ptr [[TMP11]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT10]] = fadd fast <2 x float> [[VEC_IND9]], [[DOTSPLAT8]] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: middle.block: -; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP1]] ; 
VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] ; VEC2_INTERL1_PRED_STORE: for.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] @@ -1142,28 +1110,26 @@ ; VEC4_INTERL1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC4_INTERL1-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL1: for.body.preheader: -; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; VEC4_INTERL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: -; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4 -; VEC4_INTERL1-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], 1.000000e+00 +; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967292 +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 +; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL1-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VEC4_INTERL1: middle.block: -; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL1: scalar.ph: ; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -1189,31 +1155,29 @@ ; VEC4_INTERL2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC4_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC4_INTERL2: for.body.preheader: -; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; 
VEC4_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: -; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -8 -; VEC4_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], 1.000000e+00 +; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967288 +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 -; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 -; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP6]], align 4 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL2-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VEC4_INTERL2: middle.block: -; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC4_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL2: scalar.ph: ; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -1239,33 +1203,31 @@ ; VEC1_INTERL2-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC1_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC1_INTERL2: for.body.preheader: -; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: -; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC1_INTERL2-NEXT: 
[[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], 1.000000e+00 +; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 5.000000e-01 -; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[TMP5]], 1.000000e+00 -; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fadd fast float [[TMP5]], 1.500000e+00 -; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 -; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDUCTION2]] -; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP7]], align 4 -; VEC1_INTERL2-NEXT: store float [[TMP6]], ptr [[TMP8]], align 4 +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST2]], 5.000000e-01 +; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fadd fast float [[TMP2]], 1.000000e+00 +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fadd fast float [[TMP2]], 1.500000e+00 +; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP4]] +; VEC1_INTERL2-NEXT: store float [[OFFSET_IDX]], ptr [[TMP5]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP3]], ptr [[TMP6]], align 4 ; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC1_INTERL2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC1_INTERL2-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VEC1_INTERL2: middle.block: -; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC1_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; VEC1_INTERL2: scalar.ph: ; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -1280,7 +1242,7 @@ ; VEC1_INTERL2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VEC1_INTERL2-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; VEC1_INTERL2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VEC1_INTERL2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VEC1_INTERL2: for.end.loopexit: ; VEC1_INTERL2-NEXT: br label [[FOR_END]] ; VEC1_INTERL2: for.end: @@ -1291,28 +1253,26 @@ ; 
VEC2_INTERL1_PRED_STORE-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; VEC2_INTERL1_PRED_STORE: for.body.preheader: -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; VEC2_INTERL1_PRED_STORE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 1 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: -; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -2 -; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 -; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP3]], 1.000000e+00 +; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967294 +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 +; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND:%.*]] = phi <2 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP4]], align 4 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VEC_IND_NEXT]] = fadd fast <2 x float> [[VEC_IND]], -; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VEC2_INTERL1_PRED_STORE: middle.block: -; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] ; VEC2_INTERL1_PRED_STORE: for.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -1361,57 +1321,57 @@ ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 -; VEC4_INTERL1-NEXT: 
[[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL1: vector.body: -; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] -; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 -; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer -; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 -; VEC4_INTERL1-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] +; VEC4_INTERL1-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4 +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer +; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 +; VEC4_INTERL1-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC4_INTERL1: pred.store.if: -; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] -; VEC4_INTERL1-NEXT: store float [[TMP0]], ptr [[TMP5]], align 4 +; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store float [[DOTCAST2]], ptr [[TMP3]], align 4 ; VEC4_INTERL1-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC4_INTERL1: pred.store.continue: -; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 -; VEC4_INTERL1-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]] +; VEC4_INTERL1-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i64 1 +; VEC4_INTERL1-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; VEC4_INTERL1: pred.store.if3: -; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 -; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] -; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fadd fast float [[TMP0]], 1.000000e+00 -; VEC4_INTERL1-NEXT: store float [[TMP7]], ptr [[TMP9]], align 4 -; VEC4_INTERL1-NEXT: br label [[PRED_STORE_CONTINUE3]] +; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 1 +; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]] +; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fadd fast float [[DOTCAST2]], 1.000000e+00 +; VEC4_INTERL1-NEXT: store float [[TMP7]], ptr [[TMP6]], align 4 +; VEC4_INTERL1-NEXT: br label [[PRED_STORE_CONTINUE4]] ; VEC4_INTERL1: pred.store.continue4: -; VEC4_INTERL1-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 -; VEC4_INTERL1-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] +; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP1]], i64 2 +; VEC4_INTERL1-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; VEC4_INTERL1: pred.store.if5: -; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 2 -; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = getelementptr 
inbounds float, ptr [[A]], i64 [[TMP12]] -; VEC4_INTERL1-NEXT: [[TMP11:%.*]] = fadd fast float [[TMP0]], 2.000000e+00 -; VEC4_INTERL1-NEXT: store float [[TMP11]], ptr [[TMP13]], align 4 -; VEC4_INTERL1-NEXT: br label [[PRED_STORE_CONTINUE5]] +; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 2 +; VEC4_INTERL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] +; VEC4_INTERL1-NEXT: [[TMP11:%.*]] = fadd fast float [[DOTCAST2]], 2.000000e+00 +; VEC4_INTERL1-NEXT: store float [[TMP11]], ptr [[TMP10]], align 4 +; VEC4_INTERL1-NEXT: br label [[PRED_STORE_CONTINUE6]] ; VEC4_INTERL1: pred.store.continue6: -; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 -; VEC4_INTERL1-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] +; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i64 3 +; VEC4_INTERL1-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] ; VEC4_INTERL1: pred.store.if7: -; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 3 -; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] -; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP0]], 3.000000e+00 -; VEC4_INTERL1-NEXT: store float [[TMP15]], ptr [[TMP17]], align 4 -; VEC4_INTERL1-NEXT: br label [[PRED_STORE_CONTINUE7]] +; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 3 +; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] +; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = fadd fast float [[DOTCAST2]], 3.000000e+00 +; VEC4_INTERL1-NEXT: store float [[TMP15]], ptr [[TMP14]], align 4 +; VEC4_INTERL1-NEXT: br label [[PRED_STORE_CONTINUE8]] ; VEC4_INTERL1: pred.store.continue8: ; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VEC4_INTERL1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VEC4_INTERL1: middle.block: ; VEC4_INTERL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; VEC4_INTERL1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL1: scalar.ph: ; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[CAST_VTC]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; VEC4_INTERL1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[DOTCAST]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; VEC4_INTERL1-NEXT: br label [[FOR_BODY:%.*]] ; VEC4_INTERL1: for.body: ; VEC4_INTERL1-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1438,96 +1398,96 @@ ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775800 -; VEC4_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: -; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ] -; VEC4_INTERL2-NEXT: 
[[TMP0:%.*]] = sitofp i64 [[INDEX]] to float -; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 4 -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4 -; VEC4_INTERL2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 -; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD2]], zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP6]], i64 0 -; VEC4_INTERL2-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ] +; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 4 +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; VEC4_INTERL2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer +; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD3]], zeroinitializer +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 +; VEC4_INTERL2-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC4_INTERL2: pred.store.if: -; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store float [[TMP0]], ptr [[TMP9]], align 4 +; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; VEC4_INTERL2-NEXT: store float [[DOTCAST2]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC4_INTERL2: pred.store.continue: -; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i64 1 -; VEC4_INTERL2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 +; VEC4_INTERL2-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] ; VEC4_INTERL2: pred.store.if4: -; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 1 -; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]] -; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast float [[TMP0]], 1.000000e+00 -; VEC4_INTERL2-NEXT: store float [[TMP11]], ptr [[TMP13]], align 4 -; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE4]] +; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] +; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast float [[DOTCAST2]], 1.000000e+00 +; VEC4_INTERL2-NEXT: store float [[TMP10]], ptr [[TMP9]], align 4 +; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE5]] ; VEC4_INTERL2: pred.store.continue5: -; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP6]], i64 2 -; VEC4_INTERL2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; 
VEC4_INTERL2-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 +; VEC4_INTERL2-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] ; VEC4_INTERL2: pred.store.if6: -; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 2 -; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] -; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP0]], 2.000000e+00 -; VEC4_INTERL2-NEXT: store float [[TMP15]], ptr [[TMP17]], align 4 -; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE6]] +; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 2 +; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP12]] +; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = fadd fast float [[DOTCAST2]], 2.000000e+00 +; VEC4_INTERL2-NEXT: store float [[TMP14]], ptr [[TMP13]], align 4 +; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE7]] ; VEC4_INTERL2: pred.store.continue7: -; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP6]], i64 3 -; VEC4_INTERL2-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 +; VEC4_INTERL2-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]] ; VEC4_INTERL2: pred.store.if8: -; VEC4_INTERL2-NEXT: [[TMP20:%.*]] = or i64 [[INDEX]], 3 -; VEC4_INTERL2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP20]] -; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fadd fast float [[TMP0]], 3.000000e+00 -; VEC4_INTERL2-NEXT: store float [[TMP19]], ptr [[TMP21]], align 4 -; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE8]] +; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 3 +; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP16]] +; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = fadd fast float [[DOTCAST2]], 3.000000e+00 +; VEC4_INTERL2-NEXT: store float [[TMP18]], ptr [[TMP17]], align 4 +; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE9]] ; VEC4_INTERL2: pred.store.continue9: -; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP7]], i64 0 -; VEC4_INTERL2-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] +; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0 +; VEC4_INTERL2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]] ; VEC4_INTERL2: pred.store.if10: -; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] -; VEC4_INTERL2-NEXT: [[TMP23:%.*]] = fadd fast float [[TMP0]], 4.000000e+00 -; VEC4_INTERL2-NEXT: store float [[TMP23]], ptr [[TMP24]], align 4 -; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE10]] +; VEC4_INTERL2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] +; VEC4_INTERL2-NEXT: [[TMP21:%.*]] = fadd fast float [[DOTCAST2]], 4.000000e+00 +; VEC4_INTERL2-NEXT: store float [[TMP21]], ptr [[TMP20]], align 4 +; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE11]] ; VEC4_INTERL2: pred.store.continue11: -; VEC4_INTERL2-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP7]], i64 1 -; VEC4_INTERL2-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] +; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i64 1 +; VEC4_INTERL2-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]] ; VEC4_INTERL2: pred.store.if12: -; VEC4_INTERL2-NEXT: 
[[TMP27:%.*]] = or i64 [[INDEX]], 5 -; VEC4_INTERL2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP27]] -; VEC4_INTERL2-NEXT: [[TMP26:%.*]] = fadd fast float [[TMP0]], 5.000000e+00 -; VEC4_INTERL2-NEXT: store float [[TMP26]], ptr [[TMP28]], align 4 -; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE12]] +; VEC4_INTERL2-NEXT: [[TMP23:%.*]] = or i64 [[INDEX]], 5 +; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; VEC4_INTERL2-NEXT: [[TMP25:%.*]] = fadd fast float [[DOTCAST2]], 5.000000e+00 +; VEC4_INTERL2-NEXT: store float [[TMP25]], ptr [[TMP24]], align 4 +; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE13]] ; VEC4_INTERL2: pred.store.continue13: -; VEC4_INTERL2-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP7]], i64 2 -; VEC4_INTERL2-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] +; VEC4_INTERL2-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP4]], i64 2 +; VEC4_INTERL2-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] ; VEC4_INTERL2: pred.store.if14: -; VEC4_INTERL2-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 6 -; VEC4_INTERL2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] -; VEC4_INTERL2-NEXT: [[TMP30:%.*]] = fadd fast float [[TMP0]], 6.000000e+00 -; VEC4_INTERL2-NEXT: store float [[TMP30]], ptr [[TMP32]], align 4 -; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE14]] +; VEC4_INTERL2-NEXT: [[TMP27:%.*]] = or i64 [[INDEX]], 6 +; VEC4_INTERL2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP27]] +; VEC4_INTERL2-NEXT: [[TMP29:%.*]] = fadd fast float [[DOTCAST2]], 6.000000e+00 +; VEC4_INTERL2-NEXT: store float [[TMP29]], ptr [[TMP28]], align 4 +; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE15]] ; VEC4_INTERL2: pred.store.continue15: -; VEC4_INTERL2-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP7]], i64 3 -; VEC4_INTERL2-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]] +; VEC4_INTERL2-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP4]], i64 3 +; VEC4_INTERL2-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17]] ; VEC4_INTERL2: pred.store.if16: -; VEC4_INTERL2-NEXT: [[TMP35:%.*]] = or i64 [[INDEX]], 7 -; VEC4_INTERL2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP35]] -; VEC4_INTERL2-NEXT: [[TMP34:%.*]] = fadd fast float [[TMP0]], 7.000000e+00 -; VEC4_INTERL2-NEXT: store float [[TMP34]], ptr [[TMP36]], align 4 -; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE16]] +; VEC4_INTERL2-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 7 +; VEC4_INTERL2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] +; VEC4_INTERL2-NEXT: [[TMP33:%.*]] = fadd fast float [[DOTCAST2]], 7.000000e+00 +; VEC4_INTERL2-NEXT: store float [[TMP33]], ptr [[TMP32]], align 4 +; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE17]] ; VEC4_INTERL2: pred.store.continue17: ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; VEC4_INTERL2-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC4_INTERL2-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VEC4_INTERL2-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC4_INTERL2-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VEC4_INTERL2: middle.block: ; VEC4_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 
[[SMAX]], [[N_VEC]] ; VEC4_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VEC4_INTERL2: scalar.ph: ; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[CAST_VTC]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; VEC4_INTERL2-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[DOTCAST]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; VEC4_INTERL2-NEXT: br label [[FOR_BODY:%.*]] ; VEC4_INTERL2: for.body: ; VEC4_INTERL2-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1554,38 +1514,38 @@ ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 -; VEC1_INTERL2-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = sitofp i64 [[INDEX]] to float -; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1 ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDUCTION2]] +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP0]] ; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1]], align 4 ; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP2]], align 4 ; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fcmp fast oeq float [[TMP3]], 0.000000e+00 ; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fcmp fast oeq float [[TMP4]], 0.000000e+00 ; VEC1_INTERL2-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC1_INTERL2: pred.store.if: -; VEC1_INTERL2-NEXT: store float [[TMP0]], ptr [[TMP1]], align 4 +; VEC1_INTERL2-NEXT: store float [[DOTCAST2]], ptr [[TMP1]], align 4 ; VEC1_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC1_INTERL2: pred.store.continue: ; VEC1_INTERL2-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] ; VEC1_INTERL2: pred.store.if3: -; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fadd fast float [[TMP0]], 1.000000e+00 +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fadd fast float [[DOTCAST2]], 1.000000e+00 ; VEC1_INTERL2-NEXT: store float [[TMP7]], ptr [[TMP2]], align 4 ; VEC1_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE4]] ; VEC1_INTERL2: pred.store.continue4: ; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VEC1_INTERL2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; VEC1_INTERL2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VEC1_INTERL2: middle.block: ; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; VEC1_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VEC1_INTERL2: scalar.ph: ; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] 
]
-; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[CAST_VTC]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[DOTCAST]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
 ; VEC1_INTERL2-NEXT: br label [[FOR_BODY:%.*]]
 ; VEC1_INTERL2: for.body:
 ; VEC1_INTERL2-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1601,7 +1561,7 @@
 ; VEC1_INTERL2-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; VEC1_INTERL2-NEXT: [[J_NEXT]] = fadd fast float [[J]], 1.000000e+00
 ; VEC1_INTERL2-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; VEC1_INTERL2-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]]
+; VEC1_INTERL2-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP13:![0-9]+]]
 ; VEC1_INTERL2: for.end:
 ; VEC1_INTERL2-NEXT: ret void
 ;
@@ -1612,39 +1572,39 @@
 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
 ; VEC2_INTERL1_PRED_STORE: vector.ph:
 ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806
-; VEC2_INTERL1_PRED_STORE-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VEC2_INTERL1_PRED_STORE: vector.body:
-; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = sitofp i64 [[INDEX]] to float
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; VEC2_INTERL1_PRED_STORE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i64 0
-; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i64 0
+; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; VEC2_INTERL1_PRED_STORE: pred.store.if:
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
-; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP0]], ptr [[TMP5]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
+; VEC2_INTERL1_PRED_STORE-NEXT: store float [[DOTCAST2]], ptr [[TMP3]], align 4
 ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[PRED_STORE_CONTINUE]]
 ; VEC2_INTERL1_PRED_STORE: pred.store.continue:
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP3]], i64 1
-; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i64 1
+; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
 ; VEC2_INTERL1_PRED_STORE: pred.store.if3:
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP7:%.*]] = fadd fast float [[TMP0]], 1.000000e+00
-; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP7]], ptr [[TMP9]], align 4
-; VEC2_INTERL1_PRED_STORE-NEXT: br label [[PRED_STORE_CONTINUE3]]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 1
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP5]]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP7:%.*]] = fadd fast float [[DOTCAST2]], 1.000000e+00
+; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP7]], ptr [[TMP6]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT: br label [[PRED_STORE_CONTINUE4]]
 ; VEC2_INTERL1_PRED_STORE: pred.store.continue4:
 ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; VEC2_INTERL1_PRED_STORE: middle.block:
 ; VEC2_INTERL1_PRED_STORE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; VEC2_INTERL1_PRED_STORE: for.body:
 ; VEC2_INTERL1_PRED_STORE-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VEC2_INTERL1_PRED_STORE-NEXT: [[J:%.*]] = phi float [ [[J_NEXT:%.*]], [[FOR_INC]] ], [ [[CAST_VTC]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[J:%.*]] = phi float [ [[J_NEXT:%.*]], [[FOR_INC]] ], [ [[DOTCAST]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VAR0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[I]]
 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VAR1:%.*]] = load float, ptr [[VAR0]], align 4
 ; VEC2_INTERL1_PRED_STORE-NEXT: [[VAR2:%.*]] = fcmp fast oeq float [[VAR1]], 0.000000e+00
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -9,46 +9,44 @@
 ; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 4
-; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP1]], [[A]]
 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]]
 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967292
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4, !alias.scope !3
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]],
-; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]],
-; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i1> [[TMP9]], [[TMP14]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> , <4 x i32>
-; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP12]], <4 x i32> [[PREDPHI]]
-; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP7]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4, !alias.scope !3
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD2]],
+; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> , <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = and <4 x i1> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]],
+; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i1> [[TMP7]], [[TMP12]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> , <4 x i32>
+; CHECK-NEXT: [[PREDPHI3:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> [[TMP10]], <4 x i32> [[PREDPHI]]
+; CHECK-NEXT: store <4 x i32> [[PREDPHI3]], ptr [[TMP5]], align 4, !alias.scope !0, !noalias !3
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
@@ -56,16 +54,16 @@
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP15]], [[TMP16]]
 ; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]]
 ; CHECK: if.then:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP17]], 19
+; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP15]], 19
 ; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]]
 ; CHECK: if.else:
-; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP18]], 4
+; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP16]], 4
 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5
 ; CHECK-NEXT: br label [[IF_END14]]
 ; CHECK: if.end14:
@@ -74,7 +72,7 @@
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: br label [[FOR_END]]
 ; CHECK: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -18,32 +18,30 @@
 ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG4]]
 ; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]], !dbg [[DBG4]]
 ; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1, !dbg [[DBG9:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64, !dbg [[DBG9:![0-9]+]]
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4, !dbg [[DBG9]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]], !dbg [[DBG9]]
 ; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[A1]], [[B2]], !dbg [[DBG9]]
-; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[A1]], [[B2]], !dbg [[DBG9]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16, !dbg [[DBG9]]
 ; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], !dbg [[DBG9]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4, !dbg [[DBG9]]
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967292, !dbg [[DBG9]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG9]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]], !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*, !dbg [[DBG9]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4, !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[WIDE_LOAD]], , !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]], !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*, !dbg [[DBG9]]
-; CHECK-NEXT: store <4 x float> [[TMP6]], <4 x float>* [[TMP8]], align 4, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]], !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*, !dbg [[DBG9]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[WIDE_LOAD]], , !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]], !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*, !dbg [[DBG9]]
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP6]], align 4, !dbg [[DBG9]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG9]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG9]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG9]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG9]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]], !dbg [[DBG9]]
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]], !dbg [[DBG9]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG9]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ], !dbg [[DBG9]]
@@ -51,8 +49,8 @@
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], !dbg [[DBG9]]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]], !dbg [[DBG9]]
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !dbg [[DBG9]]
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP10]], 3.000000e+00, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4, !dbg [[DBG9]]
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 3.000000e+00, !dbg [[DBG9]]
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]], !dbg [[DBG9]]
 ; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX2]], align 4, !dbg [[DBG9]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1, !dbg [[DBG9]]