diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -137,12 +137,18 @@
       Context, {MDString::get(Context, "llvm.loop.isvectorized"),
                 ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
+  // LV will interleave/unroll the vector loop if profitable. There is no
+  // benefit from further runtime unrolling.
+  MDNode *DisableRuntimeUnrollingMD = MDNode::get(
+      Context, MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+
   MDNode *LoopID = TheLoop->getLoopID();
-  MDNode *NewLoopID =
-      makePostTransformationMetadata(Context, LoopID,
-                                     {Twine(Prefix(), "vectorize.").str(),
-                                      Twine(Prefix(), "interleave.").str()},
-                                     {IsVectorizedMD});
+  MDNode *NewLoopID = makePostTransformationMetadata(
+      Context, LoopID,
+      {Twine(Prefix(), "vectorize.").str(),
+       Twine(Prefix(), "interleave.").str(),
+       Twine(Prefix(), "unroll.runtime.").str()},
+      {IsVectorizedMD, DisableRuntimeUnrollingMD});
   TheLoop->setLoopID(NewLoopID);

   // Update internal cache.
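Note: with this change LV emits both markers on every loop it produces, i.e. the vector loop and the scalar remainder loop. A minimal sketch of the resulting loop metadata, with illustrative node numbers (the exact IDs vary per module):

  ; back-branch of the vectorized loop
  br i1 %cmp, label %middle.block, label %vector.body, !llvm.loop !0

  !0 = distinct !{!0, !1, !2}                  ; loop ID referencing both hints
  !1 = !{!"llvm.loop.isvectorized", i32 1}     ; already emitted before this patch
  !2 = !{!"llvm.loop.unroll.runtime.disable"}  ; newly added by this patch

The test updates below follow from this: every vectorized loop's !llvm.loop list gains one node, and later metadata IDs shift accordingly.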
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -75,7 +75,7 @@
 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
 ; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX1]], align 8
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
 ;
 entry:
   %cmp7 = icmp sgt i64 %N, 0
@@ -131,7 +131,7 @@
 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP6]], i64 [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to *
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8, !alias.scope !9
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP11]], align 8, !alias.scope !10
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP5]]
 ; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer)
 ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
@@ -140,12 +140,12 @@
 ; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i64 [[TMP16]]
 ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64* [[TMP17]] to *
-; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8, !alias.scope !12, !noalias !9
+; CHECK-NEXT: store [[TMP13]], * [[TMP18]], align 8, !alias.scope !13, !noalias !10
 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP20:%.*]] = shl i64 [[TMP19]], 3
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -165,7 +165,7 @@
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]]
 ; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP16:![0-9]+]]
 ;
 entry:
   %cmp8 = icmp sgt i64 %N, 0
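The sve-vector-reverse churn above is purely the ID shift: each loop's metadata list now carries the extra unroll.runtime.disable node, so [[LOOP8]] becomes [[LOOP9]], !alias.scope !9 becomes !10, and so on. A sketch of the shift under assumed pre-patch numbering (illustrative only):

  ; before (assumed): !8 = distinct !{!8, !1}    ; !1 = isvectorized
  ; after: one extra node is emitted, displacing later IDs by one
  !8 = !{!"llvm.loop.unroll.runtime.disable"}
  !9 = distinct !{!9, !1, !8}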
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll
@@ -40,7 +40,7 @@
 ; CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 1, !alias.scope !3, !noalias !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -64,7 +64,7 @@
 ; CHECK-NEXT: store i8 [[COND11]], i8* [[PDST_ADDR_020]], align 1
 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_021]], -1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: while.end:
 ; CHECK-NEXT: ret void
 ;
@@ -125,17 +125,17 @@
 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PSRC]], i32 [[INDEX]]
 ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i16, i16* [[PDST]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, !alias.scope !8
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, !alias.scope !9
 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i16> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD]],
 ; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i16> zeroinitializer, [[WIDE_LOAD]]
 ; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP2]], <8 x i16> , <8 x i16> [[TMP3]]
 ; CHECK-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[WIDE_LOAD]], <8 x i16> [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[NEXT_GEP10]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* [[TMP6]], align 2, !alias.scope !11, !noalias !8
+; CHECK-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* [[TMP6]], align 2, !alias.scope !12, !noalias !9
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -159,7 +159,7 @@
 ; CHECK-NEXT: store i16 [[COND11]], i16* [[PDST_ADDR_021]], align 2
 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_022]], -1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP14:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: while.end:
 ; CHECK-NEXT: ret void
 ;
@@ -220,17 +220,17 @@
 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[PSRC]], i32 [[INDEX]]
 ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[PDST]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !15
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !16
 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]],
 ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> zeroinitializer, [[WIDE_LOAD]]
 ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> , <4 x i32> [[TMP3]]
 ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[NEXT_GEP10]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !18, !noalias !15
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !19, !noalias !16
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
@@ -254,7 +254,7 @@
 ; CHECK-NEXT: store i32 [[COND6]], i32* [[PDST_ADDR_015]], align 4
 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_016]], -1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP21:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: while.end:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
@@ -70,12 +70,12 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6
 }

-; CHECK: !0 = distinct !{!0, !1}
-; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
-; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-NEXT: !4 = distinct !{!4, !1}
-; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
+; CHECK: !0 = distinct !{!0, [[IS_VEC:!.+]], [[NO_RT_UNROLL:!.+]]}
+; CHECK-NEXT: [[IS_VEC]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-NEXT: [[NO_RT_UNROLL]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-NEXT: !3 = distinct !{!3, [[IS_VEC]], [[NO_RT_UNROLL]]}
+; CHECK-NEXT: !4 = distinct !{!4, [[IS_VEC]], [[NO_RT_UNROLL]]}
+; CHECK-NEXT: !5 = distinct !{!5, [[IS_VEC]], [[NO_RT_UNROLL]]}

 !6 = distinct !{!6, !7, !8}
 !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
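tail-folding-loop-hint.ll shows the rewrite end to end: the input loops are tagged with !6 below, and after vectorization the llvm.loop.vectorize.* hints are consumed while the two result markers are appended, for the vector loop and the scalar remainder alike. Condensed, with the output numbering implied by the CHECK lines above (!8 is defined outside this excerpt):

  ; input (from the test)
  !6 = distinct !{!6, !7, !8}
  !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

  ; output
  !0 = distinct !{!0, !1, !2}
  !1 = !{!"llvm.loop.isvectorized", i32 1}
  !2 = !{!"llvm.loop.unroll.runtime.disable"}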
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
@@ -36,7 +36,7 @@
 ; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -52,7 +52,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
 ; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1
 ; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: end:
 ; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -40,8 +40,8 @@
 }

 ; Now, we check for the Hint metadata
-; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]]}
+; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]], [[runtime_unroll:![0-9]+]]}
 ; CHECK: [[width]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]]}
 ; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[scalar]] = distinct !{[[scalar]], [[width]], [[runtime_unroll:![0-9]+]]}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -28,119 +28,29 @@
 ; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
 ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[CAST_CRD]], 5.000000e-01
 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP0]], 1.000000e+00
-; AUTO_VEC-NEXT: [[TMP1:%.*]] = add nsw i64 [[N_VEC]], -32
-; AUTO_VEC-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 5
-; AUTO_VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], 3
-; AUTO_VEC-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 96
-; AUTO_VEC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
-; AUTO_VEC: vector.ph.new:
-; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 1152921504606846972
 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]]
 ; AUTO_VEC: vector.body:
-; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
-;
AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], <8 x float>* [[TMP2]], align 4 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP4]], align 4 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 16 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], <8 x float>* [[TMP6]], align 4 -; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 8 +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], <8 x float>* [[TMP6]], align 4 +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 24 ; AUTO_VEC-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP8]], align 4 -; AUTO_VEC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 16 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], <8 x float>* [[TMP10]], align 4 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 24 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], <8 x float>* [[TMP12]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT]], <8 x float>* [[TMP14]], align 4 -; AUTO_VEC-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 8 -; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_1]], <8 x float>* [[TMP16]], align 4 -; AUTO_VEC-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 16 -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_1]], <8 x float>* [[TMP18]], align 4 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = getelementptr inbounds 
float, float* [[TMP13]], i64 24 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_1]], <8 x float>* [[TMP20]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_1]] -; AUTO_VEC-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT_1]], <8 x float>* [[TMP22]], align 4 -; AUTO_VEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 8 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_2]], <8 x float>* [[TMP24]], align 4 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 16 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_2]], <8 x float>* [[TMP26]], align 4 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 24 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_2]], <8 x float>* [[TMP28]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 96 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_2]] -; AUTO_VEC-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT_2]], <8 x float>* [[TMP30]], align 4 -; AUTO_VEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 8 -; AUTO_VEC-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_3]], <8 x float>* [[TMP32]], align 4 -; AUTO_VEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 16 -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_3]], <8 x float>* [[TMP34]], align 4 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 24 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_3]], <8 x float>* [[TMP36]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_3]] = add nuw i64 [[INDEX]], 128 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_3]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 -; AUTO_VEC-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <8 x float> [ , 
[[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[VEC_IND_EPIL:%.*]] = phi <8 x float> [ [[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], [[VECTOR_BODY_EPIL]] ], [ 0, [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_EPIL]] -; AUTO_VEC-NEXT: [[TMP38:%.*]] = bitcast float* [[TMP37]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_EPIL]], <8 x float>* [[TMP38]], align 4 -; AUTO_VEC-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 8 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP39]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_EPIL]], <8 x float>* [[TMP40]], align 4 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 16 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = bitcast float* [[TMP41]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_EPIL]], <8 x float>* [[TMP42]], align 4 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 24 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_EPIL]], <8 x float>* [[TMP44]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_EPIL]] = add nuw i64 [[INDEX_EPIL]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_EPIL]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 -; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP2:![0-9]+]] +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], <8 x float>* [[TMP8]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <8 x float> [[VEC_IND]], +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]] ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] @@ -151,8 +61,8 @@ ; AUTO_VEC-NEXT: store float [[X_06]], float* [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AUTO_VEC-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] -; AUTO_VEC-NEXT: br i1 [[TMP45]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AUTO_VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AUTO_VEC: 
for.end: ; AUTO_VEC-NEXT: ret void ; @@ -259,7 +169,7 @@ ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP6:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -298,124 +208,34 @@ ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 ; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to double ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast double [[CAST_CRD]], 3.000000e+00 -; AUTO_VEC-NEXT: [[TMP1:%.*]] = add nsw i64 [[N_VEC]], -16 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 4 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], 3 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 48 -; AUTO_VEC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 2305843009213693948 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>* +; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], <4 x double>* [[TMP2]], align 8 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr double, double* [[TMP1]], i64 4 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>* +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP4]], align 8 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[TMP1]], i64 8 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], <4 x double>* [[TMP6]], align 8 -; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr double, double* [[TMP5]], i64 4 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], <4 x double>* [[TMP6]], align 8 +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr double, double* [[TMP1]], i64 12 ; AUTO_VEC-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP8]], align 8 -; AUTO_VEC-NEXT: [[TMP9:%.*]] = getelementptr double, double* [[TMP5]], i64 8 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to <4 x double>* -; AUTO_VEC-NEXT: 
store <4 x double> [[STEP_ADD2]], <4 x double>* [[TMP10]], align 8 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = getelementptr double, double* [[TMP5]], i64 12 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], <4 x double>* [[TMP12]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT]], <4 x double>* [[TMP14]], align 8 -; AUTO_VEC-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[TMP13]], i64 4 -; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_1]], <4 x double>* [[TMP16]], align 8 -; AUTO_VEC-NEXT: [[TMP17:%.*]] = getelementptr double, double* [[TMP13]], i64 8 -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_1]], <4 x double>* [[TMP18]], align 8 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP13]], i64 12 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_1]], <4 x double>* [[TMP20]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_1]] -; AUTO_VEC-NEXT: [[TMP22:%.*]] = bitcast double* [[TMP21]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT_1]], <4 x double>* [[TMP22]], align 8 -; AUTO_VEC-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[TMP21]], i64 4 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_2]], <4 x double>* [[TMP24]], align 8 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP21]], i64 8 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_2]], <4 x double>* [[TMP26]], align 8 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP21]], i64 12 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = bitcast double* [[TMP27]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_2]], <4 x double>* [[TMP28]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_2]] -; AUTO_VEC-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x 
double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT_2]], <4 x double>* [[TMP30]], align 8 -; AUTO_VEC-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP29]], i64 4 -; AUTO_VEC-NEXT: [[TMP32:%.*]] = bitcast double* [[TMP31]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_3]], <4 x double>* [[TMP32]], align 8 -; AUTO_VEC-NEXT: [[TMP33:%.*]] = getelementptr double, double* [[TMP29]], i64 8 -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast double* [[TMP33]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_3]], <4 x double>* [[TMP34]], align 8 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP29]], i64 12 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_3]], <4 x double>* [[TMP36]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_3]] = add nuw i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_3]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 -; AUTO_VEC-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[VEC_IND_EPIL:%.*]] = phi <4 x double> [ [[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], [[VECTOR_BODY_EPIL]] ], [ 0, [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_EPIL]] -; AUTO_VEC-NEXT: [[TMP38:%.*]] = bitcast double* [[TMP37]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_EPIL]], <4 x double>* [[TMP38]], align 8 -; AUTO_VEC-NEXT: [[TMP39:%.*]] = getelementptr double, double* [[TMP37]], i64 4 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = bitcast double* [[TMP39]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_EPIL]], <4 x double>* [[TMP40]], align 8 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP37]], i64 8 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_EPIL]], <4 x double>* [[TMP42]], align 8 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP37]], i64 12 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = bitcast double* [[TMP43]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_EPIL]], <4 x double>* [[TMP44]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_EPIL]] = add nuw i64 [[INDEX_EPIL]], 16 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_EPIL]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; 
AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 -; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], <4 x double>* [[TMP8]], align 8 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; AUTO_VEC-NEXT: [[TMP45:%.*]] = add nsw i64 [[N_VEC]], -1 -; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP45]] to double -; AUTO_VEC-NEXT: [[TMP46:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00 +; AUTO_VEC-NEXT: [[TMP10:%.*]] = add nsw i64 [[N_VEC]], -1 +; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP10]] to double +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00 ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] @@ -425,9 +245,9 @@ ; AUTO_VEC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; AUTO_VEC-NEXT: [[J_NEXT]] = fadd fast double [[J]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[SMAX]] -; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AUTO_VEC: for.end: -; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP46]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] +; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] ; entry: @@ -514,7 +334,7 @@ ; AUTO_VEC-NEXT: [[J_NEXT_EPIL]] = fadd double [[J_EPIL]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP10:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[J_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ [[J_EPIL]], [[FOR_BODY_EPIL]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] @@ -559,114 +379,41 @@ ; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP1:%.*]] = fmul reassoc float [[CAST_CRD]], 4.200000e+01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd reassoc float [[TMP1]], 1.000000e+00 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = add nsw i64 [[N_VEC]], -32 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = lshr exact i64 [[TMP2]], 5 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1 -; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP2]], 0 -; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], 1152921504606846974 ; AUTO_VEC-NEXT: 
br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], -; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 8 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP5]], align 4 +; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 16 ; AUTO_VEC-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 -; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 8 +; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 +; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 24 ; AUTO_VEC-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 16 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 24 -; AUTO_VEC-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 -; AUTO_VEC-NEXT: [[TMP14:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] -; AUTO_VEC-NEXT: [[TMP15:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] -; AUTO_VEC-NEXT: [[TMP16:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] -; AUTO_VEC-NEXT: [[TMP17:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP14]], <8 x float>* [[TMP18]], align 4 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP8]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP15]], <8 x float>* [[TMP19]], align 4 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP10]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP16]], <8 x float>* [[TMP20]], align 4 -; AUTO_VEC-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP12]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP17]], <8 x 
float>* [[TMP21]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], -; AUTO_VEC-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 8 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5_1:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 16 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6_1:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 24 -; AUTO_VEC-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7_1:%.*]] = load <8 x float>, <8 x float>* [[TMP29]], align 4 -; AUTO_VEC-NEXT: [[TMP30:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], [[WIDE_LOAD_1]] -; AUTO_VEC-NEXT: [[TMP31:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], [[WIDE_LOAD5_1]] -; AUTO_VEC-NEXT: [[TMP32:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], [[WIDE_LOAD6_1]] -; AUTO_VEC-NEXT: [[TMP33:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], [[WIDE_LOAD7_1]] -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP22]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP30]], <8 x float>* [[TMP34]], align 4 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP31]], <8 x float>* [[TMP35]], align 4 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP32]], <8 x float>* [[TMP36]], align 4 -; AUTO_VEC-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP33]], <8 x float>* [[TMP37]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1]] = add nuw i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], -; AUTO_VEC-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 -; AUTO_VEC-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_1]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd reassoc <8 x float> 
[[STEP_ADD2_EPIL]], -; AUTO_VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_UNR]] -; AUTO_VEC-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP39]], align 4 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 8 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP41]], align 4 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 16 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP43]], align 4 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 24 -; AUTO_VEC-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP45]], align 4 -; AUTO_VEC-NEXT: [[TMP46:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], [[WIDE_LOAD_EPIL]] -; AUTO_VEC-NEXT: [[TMP47:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], [[WIDE_LOAD5_EPIL]] -; AUTO_VEC-NEXT: [[TMP48:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_EPIL]], [[WIDE_LOAD6_EPIL]] -; AUTO_VEC-NEXT: [[TMP49:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3_EPIL]], [[WIDE_LOAD7_EPIL]] -; AUTO_VEC-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP38]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP46]], <8 x float>* [[TMP50]], align 4 -; AUTO_VEC-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP40]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP47]], <8 x float>* [[TMP51]], align 4 -; AUTO_VEC-NEXT: [[TMP52:%.*]] = bitcast float* [[TMP42]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP48]], <8 x float>* [[TMP52]], align 4 -; AUTO_VEC-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP53]], align 4 -; AUTO_VEC-NEXT: br label [[MIDDLE_BLOCK]] +; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; AUTO_VEC-NEXT: [[TMP10:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] +; AUTO_VEC-NEXT: [[TMP12:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] +; AUTO_VEC-NEXT: [[TMP13:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] +; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP10]], <8 x float>* [[TMP14]], align 4 +; AUTO_VEC-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP4]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP11]], <8 x float>* [[TMP15]], align 4 +; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP12]], <8 x float>* [[TMP16]], align 4 +; AUTO_VEC-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP13]], <8 x float>* [[TMP17]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd reassoc <8 x float> [[STEP_ADD3]], +; AUTO_VEC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq 
i64 [[N_VEC]], [[TMP0]] ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] @@ -676,13 +423,13 @@ ; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV]] -; AUTO_VEC-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP54]] +; AUTO_VEC-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP19]] ; AUTO_VEC-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[ADD3]] = fadd reassoc float [[X_012]], 4.200000e+01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AUTO_VEC-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] -; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; entry: %cmp.not11 = icmp eq i32 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -398,7 +398,7 @@ ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -643,7 +643,7 @@ ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -874,7 +874,7 @@ ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -1105,7 +1105,7 @@ ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -1336,7 +1336,7 @@ ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x 
i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; @@ -1438,142 +1438,24 @@ ; AVX512-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]] ; AVX512-NEXT: [[TMP12:%.*]] = shl i64 [[N_VEC]], 4 ; AVX512-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]] -; AVX512-NEXT: [[TMP13:%.*]] = add nsw i64 [[N_VEC]], -16 -; AVX512-NEXT: [[TMP14:%.*]] = lshr exact i64 [[TMP13]], 4 -; AVX512-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP14]], 1 -; AVX512-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP15]], 7 -; AVX512-NEXT: [[TMP16:%.*]] = icmp ult i64 [[TMP13]], 112 -; AVX512-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AVX512: vector.ph.new: -; AVX512-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP15]], 2305843009213693944 ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH_NEW]] ], [ [[PTR_IND_7:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_7:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_7:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64> -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP17]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP20:%.*]] = bitcast float* [[NEXT_GEP]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP20]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP17]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP21]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16 -; AVX512-NEXT: [[PTR_IND:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 256 -; AVX512-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT]] -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[PTR_IND]], <16 x i64> -; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x float>, <16 x float>* [[TMP24]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_1]], <16 x float*> [[TMP22]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: 
[[TMP25:%.*]] = bitcast float* [[NEXT_GEP_1]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_1:%.*]] = load <16 x float>, <16 x float>* [[TMP25]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP22]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_1]], <16 x float*> [[TMP26]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 32 -; AVX512-NEXT: [[PTR_IND_1:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 512 -; AVX512-NEXT: [[NEXT_GEP_2:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_1]] -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr float, float* [[PTR_IND_1]], <16 x i64> -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x float>, <16 x float>* [[TMP29]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_2]], <16 x float*> [[TMP27]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP30:%.*]] = bitcast float* [[NEXT_GEP_2]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_2:%.*]] = load <16 x float>, <16 x float>* [[TMP30]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP27]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_2]], <16 x float*> [[TMP31]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48 -; AVX512-NEXT: [[PTR_IND_2:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 768 -; AVX512-NEXT: [[NEXT_GEP_3:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_2]] -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[PTR_IND_2]], <16 x i64> -; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x float>, <16 x float>* [[TMP34]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_3]], <16 x float*> [[TMP32]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[NEXT_GEP_3]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_3:%.*]] = load <16 x float>, <16 x float>* [[TMP35]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP32]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_3]], <16 x float*> [[TMP36]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_3:%.*]] = or i64 [[INDEX]], 64 -; AVX512-NEXT: [[PTR_IND_3:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1024 -; AVX512-NEXT: [[NEXT_GEP_4:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_3]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr float, float* [[PTR_IND_3]], <16 x i64> -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_4]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD_4:%.*]] = load <16 x float>, <16 x float>* [[TMP39]], align 4, !alias.scope !2 -; AVX512-NEXT: call void 
@llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_4]], <16 x float*> [[TMP37]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP40:%.*]] = bitcast float* [[NEXT_GEP_4]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_4:%.*]] = load <16 x float>, <16 x float>* [[TMP40]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP37]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_4]], <16 x float*> [[TMP41]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_4:%.*]] = or i64 [[INDEX]], 80 -; AVX512-NEXT: [[PTR_IND_4:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1280 -; AVX512-NEXT: [[NEXT_GEP_5:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_4]] -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[PTR_IND_4]], <16 x i64> -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_5]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD_5:%.*]] = load <16 x float>, <16 x float>* [[TMP44]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_5]], <16 x float*> [[TMP42]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[NEXT_GEP_5]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_5:%.*]] = load <16 x float>, <16 x float>* [[TMP45]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP42]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_5]], <16 x float*> [[TMP46]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_5:%.*]] = or i64 [[INDEX]], 96 -; AVX512-NEXT: [[PTR_IND_5:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1536 -; AVX512-NEXT: [[NEXT_GEP_6:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_5]] -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr float, float* [[PTR_IND_5]], <16 x i64> -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_6]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD_6:%.*]] = load <16 x float>, <16 x float>* [[TMP49]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_6]], <16 x float*> [[TMP47]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP50:%.*]] = bitcast float* [[NEXT_GEP_6]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_6:%.*]] = load <16 x float>, <16 x float>* [[TMP50]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP47]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_6]], <16 x float*> [[TMP51]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_6:%.*]] = or i64 [[INDEX]], 112 -; AVX512-NEXT: [[PTR_IND_6:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1792 -; AVX512-NEXT: [[NEXT_GEP_7:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_6]] -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[PTR_IND_6]], <16 x i64> -; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_7]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP53]] to <16 x 
float>* -; AVX512-NEXT: [[WIDE_LOAD_7:%.*]] = load <16 x float>, <16 x float>* [[TMP54]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_7]], <16 x float*> [[TMP52]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP55:%.*]] = bitcast float* [[NEXT_GEP_7]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_7:%.*]] = load <16 x float>, <16 x float>* [[TMP55]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP52]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_7]], <16 x float*> [[TMP56]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_7]] = add nuw i64 [[INDEX]], 128 -; AVX512-NEXT: [[PTR_IND_7]] = getelementptr float, float* [[POINTER_PHI]], i64 2048 -; AVX512-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8 -; AVX512-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]] -; AVX512-NEXT: br i1 [[NITER_NCMP_7]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; AVX512: middle.block.unr-lcssa: -; AVX512-NEXT: [[POINTER_PHI_UNR:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND_7]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_7]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AVX512-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AVX512: vector.body.epil: -; AVX512-NEXT: [[POINTER_PHI_EPIL:%.*]] = phi float* [ [[PTR_IND_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[POINTER_PHI_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AVX512-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AVX512-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], [[VECTOR_BODY_EPIL]] ], [ 0, [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AVX512-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_EPIL]] -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <16 x i64> -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP59:%.*]] = bitcast float* [[TMP58]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP59]], align 4, !alias.scope !2 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_EPIL]], <16 x float*> [[TMP57]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <16 x float>* -; AVX512-NEXT: [[WIDE_LOAD15_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP60]], align 4, !alias.scope !9 -; AVX512-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP57]], i64 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_EPIL]], <16 x float*> [[TMP61]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7 -; AVX512-NEXT: [[INDEX_NEXT_EPIL]] = add nuw i64 [[INDEX_EPIL]], 16 -; AVX512-NEXT: [[PTR_IND_EPIL]] = getelementptr float, float* [[POINTER_PHI_EPIL]], i64 256 -; AVX512-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 -; AVX512-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AVX512-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop 
[[LOOP11:![0-9]+]] +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]] +; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP15]], align 4, !alias.scope !3 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP13]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !6, !noalias !8 +; AVX512-NEXT: [[TMP16:%.*]] = bitcast float* [[NEXT_GEP]] to <16 x float>* +; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP16]], align 4, !alias.scope !10 +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP13]], i64 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP17]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>), !alias.scope !6, !noalias !8 +; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 256 +; AVX512-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] @@ -1585,15 +1467,15 @@ ; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ] ; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]] -; AVX512-NEXT: [[TMP62:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; AVX512-NEXT: store float [[TMP62]], float* [[DEST_ADDR_011]], align 4 -; AVX512-NEXT: [[TMP63:%.*]] = load float, float* [[PTR_ADDR_012]], align 4 +; AVX512-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AVX512-NEXT: store float [[TMP19]], float* [[DEST_ADDR_011]], align 4 +; AVX512-NEXT: [[TMP20:%.*]] = load float, float* [[PTR_ADDR_012]], align 4 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1 -; AVX512-NEXT: store float [[TMP63]], float* [[ARRAYIDX5]], align 4 +; AVX512-NEXT: store float [[TMP20]], float* [[ARRAYIDX5]], align 4 ; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1 ; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16 ; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]] -; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1655,38 +1537,38 @@ ; FVW2-NEXT: [[NEXT_GEP19:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP19]] ; FVW2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]] ; FVW2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>* -; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, !alias.scope !7 +; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, 
!alias.scope !8 ; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i64 2 ; FVW2-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <2 x float>* -; FVW2-NEXT: [[WIDE_LOAD20:%.*]] = load <2 x float>, <2 x float>* [[TMP23]], align 4, !alias.scope !7 +; FVW2-NEXT: [[WIDE_LOAD20:%.*]] = load <2 x float>, <2 x float>* [[TMP23]], align 4, !alias.scope !8 ; FVW2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 -; FVW2-NEXT: store float [[TMP24]], float* [[NEXT_GEP16]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP24]], float* [[NEXT_GEP16]], align 4, !alias.scope !11, !noalias !13 ; FVW2-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 -; FVW2-NEXT: store float [[TMP25]], float* [[NEXT_GEP17]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP25]], float* [[NEXT_GEP17]], align 4, !alias.scope !11, !noalias !13 ; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i32 0 -; FVW2-NEXT: store float [[TMP26]], float* [[NEXT_GEP18]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP26]], float* [[NEXT_GEP18]], align 4, !alias.scope !11, !noalias !13 ; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[WIDE_LOAD20]], i32 1 -; FVW2-NEXT: store float [[TMP27]], float* [[NEXT_GEP19]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP27]], float* [[NEXT_GEP19]], align 4, !alias.scope !11, !noalias !13 ; FVW2-NEXT: [[TMP28:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>* -; FVW2-NEXT: [[WIDE_LOAD21:%.*]] = load <2 x float>, <2 x float>* [[TMP28]], align 4, !alias.scope !14 +; FVW2-NEXT: [[WIDE_LOAD21:%.*]] = load <2 x float>, <2 x float>* [[TMP28]], align 4, !alias.scope !15 ; FVW2-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2 ; FVW2-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>* -; FVW2-NEXT: [[WIDE_LOAD22:%.*]] = load <2 x float>, <2 x float>* [[TMP30]], align 4, !alias.scope !14 +; FVW2-NEXT: [[WIDE_LOAD22:%.*]] = load <2 x float>, <2 x float>* [[TMP30]], align 4, !alias.scope !15 ; FVW2-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP16]], i64 1 ; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP17]], i64 1 ; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP18]], i64 1 ; FVW2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP19]], i64 1 ; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i32 0 -; FVW2-NEXT: store float [[TMP35]], float* [[TMP31]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP35]], float* [[TMP31]], align 4, !alias.scope !11, !noalias !13 ; FVW2-NEXT: [[TMP36:%.*]] = extractelement <2 x float> [[WIDE_LOAD21]], i32 1 -; FVW2-NEXT: store float [[TMP36]], float* [[TMP32]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP36]], float* [[TMP32]], align 4, !alias.scope !11, !noalias !13 ; FVW2-NEXT: [[TMP37:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i32 0 -; FVW2-NEXT: store float [[TMP37]], float* [[TMP33]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP37]], float* [[TMP33]], align 4, !alias.scope !11, !noalias !13 ; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[WIDE_LOAD22]], i32 1 -; FVW2-NEXT: store float [[TMP38]], float* [[TMP34]], align 4, !alias.scope !10, !noalias !12 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP34]], align 4, !alias.scope !11, !noalias !13 ; 
FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; FVW2-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; FVW2-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; FVW2-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; FVW2: middle.block: ; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] @@ -1706,7 +1588,7 @@ ; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1 ; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16 ; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]] -; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; FVW2: for.end: ; FVW2-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -71,7 +71,7 @@ ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T4]] @@ -129,14 +129,14 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 8, !alias.scope !9, !noalias !12 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP3]], align 4, !alias.scope !8, !noalias !11 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT9]], <16 x i32*> [[BROADCAST_SPLAT11]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !11 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP3]], align 4, !alias.scope !9, !noalias !12 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT9]], <16 x i32*> [[BROADCAST_SPLAT11]], i32 4, <16 x i1> [[TMP2]]), !alias.scope !12 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] ; CHECK-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -166,7 +166,7 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT22]], <8 x i32*> [[BROADCAST_SPLAT24]], i32 4, <8 x i1> [[TMP7]]) ; CHECK-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX15]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC14]] -; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N17:%.*]] = icmp eq i64 [[SMAX12]], [[N_VEC14]] ; CHECK-NEXT: br i1 [[CMP_N17]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -83,7 +83,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -197,7 +197,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -318,7 +318,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) ; AVX512-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 8 ; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 10000 -; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; AVX512: vec.epilog.middle.block: ; AVX512-NEXT: [[CMP_N20:%.*]] = icmp eq i64 10000, 10000 ; AVX512-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] @@ -406,20 +406,20 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !12 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !14 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !alias.scope !15 ; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP9]], i32 0 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !17, !noalias !19 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -442,7 +442,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -481,16 +481,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !12 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !12 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !12 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* 
[[TMP15]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !12 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -501,16 +501,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison), !alias.scope !15 ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison), !alias.scope !15 ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison), !alias.scope !15 ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !alias.scope !15 ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -521,19 +521,19 @@ ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !17, !noalias !19 ; AVX2-NEXT: 
[[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 8 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !17, !noalias !19 ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !17, !noalias !19 ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 24 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !17, !noalias !19 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -556,7 +556,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -774,21 +774,21 @@ ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !22 ; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX1-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0 ; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison), !alias.scope !24 +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison), !alias.scope !25 ; AVX1-NEXT: 
[[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 ; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !27, !noalias !29 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -812,7 +812,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -851,16 +851,16 @@ ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !22 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !22 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !22 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !22 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -871,16 +871,16 @@ ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call 
<8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison), !alias.scope !25 ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison), !alias.scope !25 ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison), !alias.scope !25 ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison), !alias.scope !25 ; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> ; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> ; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> @@ -895,19 +895,19 @@ ; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]), !alias.scope !27, !noalias !29 ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 8 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]), !alias.scope !27, !noalias !29 ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 16 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]), !alias.scope 
!27, !noalias !29 ; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 24 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !27, !noalias !29 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -931,7 +931,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -1162,16 +1162,16 @@ ; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !32 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4 ; AVX-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !32 ; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !32 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12 ; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !32 ; AVX-NEXT: [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], ; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], ; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], @@ -1182,16 +1182,16 @@ ; AVX-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[B]], i64 [[TMP3]] ; AVX-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP20]], i32 0 ; AVX-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> 
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> poison), !alias.scope !35 ; AVX-NEXT: [[TMP26:%.*]] = getelementptr double, double* [[TMP20]], i32 4 ; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> poison), !alias.scope !35 ; AVX-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP20]], i32 8 ; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> poison), !alias.scope !35 ; AVX-NEXT: [[TMP30:%.*]] = getelementptr double, double* [[TMP20]], i32 12 ; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> poison), !alias.scope !35 ; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> ; AVX-NEXT: [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double> ; AVX-NEXT: [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double> @@ -1206,19 +1206,19 @@ ; AVX-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[A]], i64 [[TMP3]] ; AVX-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP40]], i32 0 ; AVX-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP36]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[TMP16]]), !alias.scope !37, !noalias !39 ; AVX-NEXT: [[TMP46:%.*]] = getelementptr double, double* [[TMP40]], i32 4 ; AVX-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP37]], <4 x double>* [[TMP47]], i32 8, <4 x i1> [[TMP17]]), !alias.scope !37, !noalias !39 ; AVX-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP40]], i32 8 ; AVX-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP38]], <4 x double>* [[TMP49]], i32 8, <4 x i1> [[TMP18]]), !alias.scope !37, !noalias !39 ; AVX-NEXT: [[TMP50:%.*]] = getelementptr double, double* [[TMP40]], i32 12 ; AVX-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <4 x 
double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !37, !noalias !39 ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1242,7 +1242,7 @@ ; AVX: for.inc: ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; AVX: for.end: ; AVX-NEXT: ret void ; @@ -1669,22 +1669,22 @@ ; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -3 ; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !42 ; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !42 ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3 ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !42 ; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !41 +; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !42 ; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer ; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer @@ -1698,25 +1698,25 @@ ; AVX2-NEXT: 
[[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i32 -3 ; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> poison), !alias.scope !45 ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP24]], i32 -4 ; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP31]], i32 -3 ; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !45 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP24]], i32 -8 ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i32 -3 ; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> poison), !alias.scope !45 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP24]], i32 -12 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i32 -3 ; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !44 +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> poison), !alias.scope !45 ; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> ; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> @@ -1730,25 +1730,25 @@ ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr double, double* [[TMP44]], i32 0 ; AVX2-NEXT: [[TMP49:%.*]] = getelementptr double, double* [[TMP48]], i32 -3 ; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>* -; AVX2-NEXT: call void 
@llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !47, !noalias !49 ; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP51:%.*]] = getelementptr double, double* [[TMP44]], i32 -4 ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP51]], i32 -3 ; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !47, !noalias !49 ; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP54:%.*]] = getelementptr double, double* [[TMP44]], i32 -8 ; AVX2-NEXT: [[TMP55:%.*]] = getelementptr double, double* [[TMP54]], i32 -3 ; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !47, !noalias !49 ; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ; AVX2-NEXT: [[TMP57:%.*]] = getelementptr double, double* [[TMP44]], i32 -12 ; AVX2-NEXT: [[TMP58:%.*]] = getelementptr double, double* [[TMP57]], i32 -3 ; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !47, !noalias !49 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1771,7 +1771,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]] +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -2046,7 +2046,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2072,7 +2072,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2170,7 +2170,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]] +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2196,7 +2196,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -33,7 +33,7 @@ ; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -52,7 +52,7 @@ ; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: br label %for.body @@ -104,7 +104,7 @@ ; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; 
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -123,7 +123,7 @@
 ; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
 br label %for.body
@@ -191,7 +191,7 @@
 ; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]])
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -211,7 +211,7 @@
 ; CHECK-NEXT: [[SUM_1]] = add nuw nsw i32 [[ADD]], [[SUM_0]]
 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]]
@@ -237,12 +237,12 @@
 ret i32 %sum.1
 }
-; CHECK: !0 = distinct !{!0, !1}
-; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
-; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-NEXT: !4 = distinct !{!4, !1}
-; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[IS_VEC:!.+]], [[NO_RT_UNROLL:!.+]]}
+; CHECK-NEXT: [[IS_VEC]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-NEXT: [[NO_RT_UNROLL]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-NEXT: [[LOOP3]] = distinct !{[[LOOP3]], [[IS_VEC]], [[NO_RT_UNROLL]]}
+; CHECK-NEXT: [[LOOP4]] = distinct !{[[LOOP4]], [[IS_VEC]], [[NO_RT_UNROLL]]}
+; CHECK-NEXT: [[LOOP5]] = distinct !{[[LOOP5]], [[IS_VEC]], [[NO_RT_UNROLL]]}
 attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" }
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization-2.ll
@@ -75,7 +75,7 @@
 ; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4
 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: br label [[FOR_END]]
 ; CHECK: for.end:
@@ -147,11 +147,11 @@
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[A]], align 4, !alias.scope !11
+; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP3]], align 4, !alias.scope !9, !noalias !12
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[A]], align 4, !alias.scope !12
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -172,7 +172,7 @@
 ; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4
 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: br label [[FOR_END]]
 ; CHECK: for.end:
@@ -232,13 +232,13 @@
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !15
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 8, !alias.scope !16
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4, !alias.scope !18, !noalias !15
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4, !alias.scope !19, !noalias !16
 ; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]])
@@ -257,7 +257,7 @@
 ; CHECK-NEXT: [[I3]] = add i32 [[I0]], [[I2]]
 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[I3_LCSSA:%.*]] = phi i32 [ [[I3]], [[FOR_BODY]] ]
 ; CHECK-NEXT: br label [[FOR_END]]
diff --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
--- a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
@@ -67,7 +67,7 @@
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -82,7 +82,7 @@
 ; CHECK-NEXT: store i8 7, i8* [[AJP3]], align 8
 ; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
--- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll
@@ -55,7 +55,7 @@
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -67,7 +67,7 @@
 ; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -117,7 +117,7 @@
 ; VF2UF2-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
 ; VF2UF2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; VF2UF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; VF2UF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF2UF2: middle.block:
 ; VF2UF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF2UF2: scalar.ph:
@@ -129,7 +129,7 @@
 ; VF2UF2-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
 ; VF2UF2-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; VF2UF2-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; VF2UF2-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop !2
+; VF2UF2-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF2UF2: exit:
 ; VF2UF2-NEXT: ret void
 ;
@@ -174,7 +174,7 @@
 ; VF1UF4: pred.store.continue9:
 ; VF1UF4-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; VF1UF4-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; VF1UF4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; VF1UF4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF1UF4: middle.block:
 ; VF1UF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; VF1UF4: scalar.ph:
@@ -186,7 +186,7 @@
 ; VF1UF4-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
 ; VF1UF4-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; VF1UF4-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; VF1UF4-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]]
+; VF1UF4-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; VF1UF4: exit:
 ; VF1UF4-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -77,7 +77,7 @@
 ; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_B]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -143,15 +143,15 @@
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !9, !noalias !12
 ; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]],
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
 ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !11
+; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !12
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -168,7 +168,7 @@
 ; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_A]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -307,7 +307,7 @@
 ; CHECK-NEXT: store <2 x i32> [[TMP15]], <2 x i32>* [[TMP16]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -323,7 +323,7 @@
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_A]], align 4
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
@@ -23,7 +23,7 @@
 ; CHECK-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -35,7 +35,7 @@
 ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
 br label %for.body
@@ -70,7 +70,7 @@
 ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr double, double* [[PTR1]], i64 [[TMP3]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !3
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -82,7 +82,7 @@
 ; CHECK-NEXT: [[ADDR:%.*]] = phi double* [ [[PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[PTR]] = getelementptr inbounds double, double* [[ADDR]], i64 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq double* [[PTR]], [[PTR2]]
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !4
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ;
 entry:
 %ptr2 = getelementptr inbounds double, double* %ptr1, i64 15
diff --git a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
--- a/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -39,7 +39,7 @@
 ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -55,7 +55,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
 ; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1
 ; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2
+; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: end:
 ; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
@@ -107,7 +107,7 @@
 ; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
@@ -123,7 +123,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
 ; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1
 ; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !5
+; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: end:
 ; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-once.ll b/llvm/test/Transforms/LoopVectorize/vectorize-once.ll
--- a/llvm/test/Transforms/LoopVectorize/vectorize-once.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-once.ll
@@ -9,8 +9,11 @@
 ; This test checks that we add metadata to vectorized loops
 ; CHECK-LABEL: @_Z4foo1Pii(
+; CHECK-LABEL: vector.body:
 ; CHECK: <4 x i32>
-; CHECK: llvm.loop
+; CHECK: br i1 {{.+}}, label %middle.block, label %vector.body, !llvm.loop [[VEC_LOOP_1_ID:!.+]]
+; CHECK-LABEL: for.body.i:
+; CHECK: br i1 {{.+}}, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i, !llvm.loop [[SCALAR_LOOP_1_ID:!.+]]
 ; CHECK: ret
 ; This test comes from the loop:
@@ -67,10 +70,10 @@
 attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "frame-pointer"="non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
-; CHECK: !0 = distinct !{!0, !1}
-; CHECK: !1 = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: !2 = distinct !{!2, !3, !1}
-; CHECK: !3 = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[VEC_LOOP_1_ID]] = distinct !{[[VEC_LOOP_1_ID]], [[IS_VEC:!.+]], [[NO_RT_UNROLL:!.+]]}
+; CHECK: [[IS_VEC]] = !{!"llvm.loop.isvectorized", i32 1}
= !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[NO_RT_UNROLL]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[SCALAR_LOOP_1_ID]] = distinct !{[[SCALAR_LOOP_1_ID]], [[IS_VEC]], [[NO_RT_UNROLL]]} !0 = !{!0, !1} !1 = !{!"llvm.loop.vectorize.width", i32 1}