diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -516,6 +516,41 @@
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE) {
+    bool IsVectorLoop = false;
+    if (auto IsVectorized =
+            findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
+      IsVectorLoop =
+          mdconst::extract<ConstantInt>(**IsVectorized)->getZExtValue();
+
+    // Scan the loop: don't unroll loops with calls.
+    for (BasicBlock *BB : L->blocks()) {
+      for (Instruction &I : *BB) {
+        if (IsVectorLoop && I.getType()->isVectorTy()) {
+          UP.Threshold = 0;
+          UP.PartialThreshold = 0;
+          return;
+        }
+
+        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
+            if (!thisT()->isLoweredToCall(F))
+              continue;
+          }
+
+          if (ORE) {
+            ORE->emit([&]() {
+              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
+                                        L->getHeader())
+                     << "advising against unrolling the loop because it "
+                        "contains a "
+                     << ore::NV("Call", &I);
+            });
+          }
+          return;
+        }
+      }
+    }
+
     // This unrolling functionality is target independent, but to provide some
     // motivation for its intended use, for x86:

@@ -548,29 +583,6 @@
     else
       return;

-    // Scan the loop: don't unroll loops with calls.
-    for (BasicBlock *BB : L->blocks()) {
-      for (Instruction &I : *BB) {
-        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-            if (!thisT()->isLoweredToCall(F))
-              continue;
-          }
-
-          if (ORE) {
-            ORE->emit([&]() {
-              return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
-                                        L->getHeader())
-                     << "advising against unrolling the loop because it "
-                        "contains a "
-                     << ore::NV("Call", &I);
-            });
-          }
-          return;
-        }
-      }
-    }
-
     // Enable runtime and partial unrolling up to the specified size.
     // Enable using trip count upper bound to unroll loops.
UP.Partial = UP.Runtime = UP.UpperBound = true; diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -25,122 +25,32 @@ ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[ZEXT]], 4294967264 -; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float -; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[CAST_CRD]], 5.000000e-01 +; AUTO_VEC-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[CAST_VTC]], 5.000000e-01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP0]], 1.000000e+00 -; AUTO_VEC-NEXT: [[TMP1:%.*]] = add nsw i64 [[N_VEC]], -32 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 5 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], 3 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 96 -; AUTO_VEC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 1152921504606846972 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], <8 x float>* [[TMP2]], align 4 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP4]], align 4 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 16 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], <8 x float>* [[TMP6]], align 4 -; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 8 +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], <8 x float>* [[TMP6]], align 4 +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 24 ; AUTO_VEC-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP8]], align 4 -; AUTO_VEC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 16 -; 
AUTO_VEC-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], <8 x float>* [[TMP10]], align 4 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 24 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], <8 x float>* [[TMP12]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT]], <8 x float>* [[TMP14]], align 4 -; AUTO_VEC-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 8 -; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_1]], <8 x float>* [[TMP16]], align 4 -; AUTO_VEC-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 16 -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_1]], <8 x float>* [[TMP18]], align 4 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 24 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_1]], <8 x float>* [[TMP20]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_1]] -; AUTO_VEC-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT_1]], <8 x float>* [[TMP22]], align 4 -; AUTO_VEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 8 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_2]], <8 x float>* [[TMP24]], align 4 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 16 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_2]], <8 x float>* [[TMP26]], align 4 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 24 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_2]], <8 x float>* [[TMP28]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 96 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP29:%.*]] = getelementptr inbounds 
float, float* [[A]], i64 [[INDEX_NEXT_2]] -; AUTO_VEC-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT_2]], <8 x float>* [[TMP30]], align 4 -; AUTO_VEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 8 -; AUTO_VEC-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_3]], <8 x float>* [[TMP32]], align 4 -; AUTO_VEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 16 -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_3]], <8 x float>* [[TMP34]], align 4 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 24 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_3]], <8 x float>* [[TMP36]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_3]] = add nuw i64 [[INDEX]], 128 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_3]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 -; AUTO_VEC-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[VEC_IND_EPIL:%.*]] = phi <8 x float> [ [[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], [[VECTOR_BODY_EPIL]] ], [ 0, [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_EPIL]] -; AUTO_VEC-NEXT: [[TMP38:%.*]] = bitcast float* [[TMP37]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_EPIL]], <8 x float>* [[TMP38]], align 4 -; AUTO_VEC-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 8 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP39]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_EPIL]], <8 x float>* [[TMP40]], align 4 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 16 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = bitcast float* [[TMP41]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_EPIL]], <8 x float>* [[TMP42]], align 4 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 24 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_EPIL]], <8 x float>* [[TMP44]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_EPIL]] = 
add nuw i64 [[INDEX_EPIL]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_EPIL]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 -; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP2:![0-9]+]] +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], <8 x float>* [[TMP8]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <8 x float> [[VEC_IND]], +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]] ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] @@ -151,8 +61,8 @@ ; AUTO_VEC-NEXT: store float [[X_06]], float* [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AUTO_VEC-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] -; AUTO_VEC-NEXT: br i1 [[TMP45]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AUTO_VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -259,7 +169,7 @@ ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP6:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -296,126 +206,36 @@ ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 -; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to double -; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast double [[CAST_CRD]], 3.000000e+00 -; AUTO_VEC-NEXT: [[TMP1:%.*]] = add nsw i64 [[N_VEC]], -16 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 4 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], 3 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 48 -; AUTO_VEC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 2305843009213693948 +; AUTO_VEC-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to double +; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast double [[CAST_VTC]], 3.000000e+00 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[VECTOR_BODY]] ] 
+; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>* +; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], <4 x double>* [[TMP2]], align 8 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr double, double* [[TMP1]], i64 4 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>* +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP4]], align 8 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[TMP1]], i64 8 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], <4 x double>* [[TMP6]], align 8 -; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr double, double* [[TMP5]], i64 4 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], <4 x double>* [[TMP6]], align 8 +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr double, double* [[TMP1]], i64 12 ; AUTO_VEC-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP8]], align 8 -; AUTO_VEC-NEXT: [[TMP9:%.*]] = getelementptr double, double* [[TMP5]], i64 8 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], <4 x double>* [[TMP10]], align 8 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = getelementptr double, double* [[TMP5]], i64 12 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], <4 x double>* [[TMP12]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT]], <4 x double>* [[TMP14]], align 8 -; AUTO_VEC-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[TMP13]], i64 4 -; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_1]], <4 x double>* [[TMP16]], align 8 -; AUTO_VEC-NEXT: [[TMP17:%.*]] = getelementptr double, double* [[TMP13]], i64 8 -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_1]], <4 x double>* [[TMP18]], align 8 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP13]], i64 12 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_1]], <4 x double>* [[TMP20]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 32 -; 
AUTO_VEC-NEXT: [[VEC_IND_NEXT_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_1]] -; AUTO_VEC-NEXT: [[TMP22:%.*]] = bitcast double* [[TMP21]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT_1]], <4 x double>* [[TMP22]], align 8 -; AUTO_VEC-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[TMP21]], i64 4 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_2]], <4 x double>* [[TMP24]], align 8 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP21]], i64 8 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_2]], <4 x double>* [[TMP26]], align 8 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP21]], i64 12 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = bitcast double* [[TMP27]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_2]], <4 x double>* [[TMP28]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_2]] -; AUTO_VEC-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT_2]], <4 x double>* [[TMP30]], align 8 -; AUTO_VEC-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP29]], i64 4 -; AUTO_VEC-NEXT: [[TMP32:%.*]] = bitcast double* [[TMP31]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_3]], <4 x double>* [[TMP32]], align 8 -; AUTO_VEC-NEXT: [[TMP33:%.*]] = getelementptr double, double* [[TMP29]], i64 8 -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast double* [[TMP33]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_3]], <4 x double>* [[TMP34]], align 8 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP29]], i64 12 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_3]], <4 x double>* [[TMP36]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_3]] = add nuw i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_3]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 -; AUTO_VEC-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: 
[[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[VEC_IND_EPIL:%.*]] = phi <4 x double> [ [[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], [[VECTOR_BODY_EPIL]] ], [ 0, [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_EPIL]] -; AUTO_VEC-NEXT: [[TMP38:%.*]] = bitcast double* [[TMP37]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_EPIL]], <4 x double>* [[TMP38]], align 8 -; AUTO_VEC-NEXT: [[TMP39:%.*]] = getelementptr double, double* [[TMP37]], i64 4 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = bitcast double* [[TMP39]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_EPIL]], <4 x double>* [[TMP40]], align 8 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP37]], i64 8 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_EPIL]], <4 x double>* [[TMP42]], align 8 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP37]], i64 12 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = bitcast double* [[TMP43]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_EPIL]], <4 x double>* [[TMP44]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_EPIL]] = add nuw i64 [[INDEX_EPIL]], 16 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_EPIL]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 -; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], <4 x double>* [[TMP8]], align 8 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; AUTO_VEC-NEXT: [[TMP45:%.*]] = add nsw i64 [[N_VEC]], -1 -; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP45]] to double -; AUTO_VEC-NEXT: [[TMP46:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00 +; AUTO_VEC-NEXT: [[TMP10:%.*]] = add nsw i64 [[N_VEC]], -1 +; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP10]] to double +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00 ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] @@ -425,9 +245,9 @@ ; AUTO_VEC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; AUTO_VEC-NEXT: [[J_NEXT]] = fadd fast double [[J]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[SMAX]] -; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP9:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AUTO_VEC: for.end: -; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP46]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] +; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] ; entry: @@ -514,7 +334,7 @@ ; AUTO_VEC-NEXT: [[J_NEXT_EPIL]] = fadd double [[J_EPIL]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP10:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[J_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ [[J_EPIL]], [[FOR_BODY_EPIL]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] @@ -556,117 +376,44 @@ ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 -; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float -; AUTO_VEC-NEXT: [[TMP1:%.*]] = fmul reassoc float [[CAST_CRD]], 4.200000e+01 +; AUTO_VEC-NEXT: [[CAST_VTC:%.*]] = sitofp i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[TMP1:%.*]] = fmul reassoc float [[CAST_VTC]], 4.200000e+01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd reassoc float [[TMP1]], 1.000000e+00 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = add nsw i64 [[N_VEC]], -32 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = lshr exact i64 [[TMP2]], 5 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1 -; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP2]], 0 -; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], 1152921504606846974 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], -; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 8 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = 
load <8 x float>, <8 x float>* [[TMP5]], align 4 +; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 16 ; AUTO_VEC-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 -; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 8 +; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 +; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 24 ; AUTO_VEC-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 16 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 24 -; AUTO_VEC-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 -; AUTO_VEC-NEXT: [[TMP14:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] -; AUTO_VEC-NEXT: [[TMP15:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] -; AUTO_VEC-NEXT: [[TMP16:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] -; AUTO_VEC-NEXT: [[TMP17:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP14]], <8 x float>* [[TMP18]], align 4 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP8]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP15]], <8 x float>* [[TMP19]], align 4 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP10]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP16]], <8 x float>* [[TMP20]], align 4 -; AUTO_VEC-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP12]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP17]], <8 x float>* [[TMP21]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], -; AUTO_VEC-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 8 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5_1:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 16 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6_1:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 24 -; AUTO_VEC-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7_1:%.*]] = load 
<8 x float>, <8 x float>* [[TMP29]], align 4 -; AUTO_VEC-NEXT: [[TMP30:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], [[WIDE_LOAD_1]] -; AUTO_VEC-NEXT: [[TMP31:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], [[WIDE_LOAD5_1]] -; AUTO_VEC-NEXT: [[TMP32:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], [[WIDE_LOAD6_1]] -; AUTO_VEC-NEXT: [[TMP33:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], [[WIDE_LOAD7_1]] -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP22]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP30]], <8 x float>* [[TMP34]], align 4 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP31]], <8 x float>* [[TMP35]], align 4 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP32]], <8 x float>* [[TMP36]], align 4 -; AUTO_VEC-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP33]], <8 x float>* [[TMP37]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1]] = add nuw i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], -; AUTO_VEC-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 -; AUTO_VEC-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_1]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_EPIL]], -; AUTO_VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_UNR]] -; AUTO_VEC-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP39]], align 4 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 8 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP41]], align 4 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 16 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP43]], align 4 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 24 -; AUTO_VEC-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP45]], align 4 -; AUTO_VEC-NEXT: [[TMP46:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], [[WIDE_LOAD_EPIL]] -; AUTO_VEC-NEXT: [[TMP47:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], [[WIDE_LOAD5_EPIL]] -; AUTO_VEC-NEXT: [[TMP48:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_EPIL]], [[WIDE_LOAD6_EPIL]] -; AUTO_VEC-NEXT: [[TMP49:%.*]] = fadd reassoc <8 x 
float> [[STEP_ADD3_EPIL]], [[WIDE_LOAD7_EPIL]] -; AUTO_VEC-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP38]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP46]], <8 x float>* [[TMP50]], align 4 -; AUTO_VEC-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP40]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP47]], <8 x float>* [[TMP51]], align 4 -; AUTO_VEC-NEXT: [[TMP52:%.*]] = bitcast float* [[TMP42]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP48]], <8 x float>* [[TMP52]], align 4 -; AUTO_VEC-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP53]], align 4 -; AUTO_VEC-NEXT: br label [[MIDDLE_BLOCK]] +; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; AUTO_VEC-NEXT: [[TMP10:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] +; AUTO_VEC-NEXT: [[TMP12:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] +; AUTO_VEC-NEXT: [[TMP13:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] +; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP10]], <8 x float>* [[TMP14]], align 4 +; AUTO_VEC-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP4]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP11]], <8 x float>* [[TMP15]], align 4 +; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP12]], <8 x float>* [[TMP16]], align 4 +; AUTO_VEC-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP13]], <8 x float>* [[TMP17]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd reassoc <8 x float> [[STEP_ADD3]], +; AUTO_VEC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] @@ -676,13 +423,13 @@ ; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV]] -; AUTO_VEC-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP54]] +; AUTO_VEC-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP19]] ; AUTO_VEC-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[ADD3]] = fadd reassoc float [[X_012]], 4.200000e+01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AUTO_VEC-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] -; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; entry: %cmp.not11 = icmp eq i32 %N, 0 diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll --- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -21,235 +21,43 @@ ; O1-NEXT: entry: ; O1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O1-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O1-NEXT: br label [[VECTOR_BODY:%.*]] +; O1: vector.body: +; O1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; O1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; O1-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; O1-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; O1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; O1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 -; O1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; O1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; O1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; O1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; O1-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; O1-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; O1-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; O1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; O1-NEXT: [[TMP24:%.*]] = 
bitcast i32* [[TMP23]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O1-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; O1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; O1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; O1-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O1-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; O1-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; O1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; O1-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; O1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; O1-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O1-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; O1-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 -; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; O1-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O1-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; O1-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; O1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; O1-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; O1-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; O1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; O1-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O1-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; O1-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; O1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], 
i64 48 -; O1-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O1-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; O1-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; O1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; O1-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O1-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O1-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O1-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O1-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O1-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O1-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O1-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O1-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O1-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O1-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O1-NEXT: ret i32 [[TMP78]] +; O1-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4 +; O1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; O1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; O1-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; O1: for.end: +; O1-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +; O1-NEXT: ret i32 [[TMP6]] ; ; O2-LABEL: @enabled( ; O2-NEXT: entry: ; O2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; O2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O2-NEXT: br label [[VECTOR_BODY:%.*]] +; O2: vector.body: +; O2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; O2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; O2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; O2-NEXT: 
[[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O2-NEXT: ret i32 [[TMP78]]
+; O2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; O2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; O2-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; O2: for.end:
+; O2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT: ret i32 [[TMP6]]
;
; O3-LABEL: @enabled(
; O3-NEXT: entry:
@@ -372,586 +180,106 @@
; O3DEFAULT-NEXT: entry:
; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; O3DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; O3DEFAULT: vector.body:
+; O3DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; O3DEFAULT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; O3DEFAULT-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O3DEFAULT-NEXT: ret i32 [[TMP78]]
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; O3DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; O3DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; O3DEFAULT-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; O3DEFAULT: for.end:
+; O3DEFAULT-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT: ret i32 [[TMP6]]
;
; Os-LABEL: @enabled(
; Os-NEXT: entry:
; Os-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; Os-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Os-NEXT: br label [[VECTOR_BODY:%.*]]
+; Os: vector.body:
+; Os-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; Os-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; Os-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; Os-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; Os-NEXT: ret i32 [[TMP78]]
+; Os-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; Os-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; Os-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; Os-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; Os: for.end:
+; Os-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT: ret i32 [[TMP6]]
;
; Oz-LABEL: @enabled(
; Oz-NEXT: entry:
; Oz-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; Oz-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; Oz-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; Oz-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Oz-NEXT: br label [[VECTOR_BODY:%.*]]
+; Oz: vector.body:
+; Oz-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; Oz-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; Oz-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; Oz-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; Oz-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; Oz-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; Oz-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; Oz-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; Oz-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; Oz-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; Oz-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; Oz-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; Oz-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; Oz-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; Oz-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; Oz-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; Oz-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; Oz-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; Oz-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; Oz-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; Oz-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; Oz-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; Oz-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; Oz-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; Oz-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; Oz-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; Oz-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; Oz-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; Oz-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; Oz-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; Oz-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; Oz-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; Oz-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; Oz-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; Oz-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; Oz-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; Oz-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; Oz-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; Oz-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; Oz-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; Oz-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; Oz-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; Oz-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; Oz-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; Oz-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; Oz-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; Oz-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; Oz-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; Oz-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; Oz-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; Oz-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; Oz-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; Oz-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; Oz-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; Oz-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; Oz-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; Oz-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; Oz-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; Oz-NEXT: ret i32 [[TMP78]]
+; Oz-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; Oz-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; Oz-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; Oz-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; Oz: for.end:
+; Oz-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT: ret i32 [[TMP6]]
;
; O1VEC2-LABEL: @enabled(
; O1VEC2-NEXT: entry:
; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; O1VEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O1VEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O1VEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC2-NEXT: br label [[VECTOR_BODY:%.*]]
+; O1VEC2: vector.body:
+; O1VEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; O1VEC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; O1VEC2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; O1VEC2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; O1VEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O1VEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O1VEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O1VEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O1VEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O1VEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O1VEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O1VEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O1VEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O1VEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O1VEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O1VEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O1VEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O1VEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O1VEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O1VEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O1VEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O1VEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O1VEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O1VEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O1VEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O1VEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O1VEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O1VEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O1VEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O1VEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O1VEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O1VEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O1VEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O1VEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O1VEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O1VEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O1VEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O1VEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O1VEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O1VEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O1VEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O1VEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O1VEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O1VEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O1VEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O1VEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O1VEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O1VEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O1VEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O1VEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O1VEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O1VEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O1VEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O1VEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O1VEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O1VEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O1VEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O1VEC2-NEXT: ret i32 [[TMP78]]
+; O1VEC2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; O1VEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; O1VEC2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; O1VEC2-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; O1VEC2: for.end:
+; O1VEC2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT: ret i32 [[TMP6]]
;
; OzVEC2-LABEL: @enabled(
; OzVEC2-NEXT: entry:
; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; OzVEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; OzVEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; OzVEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC2-NEXT: br label [[VECTOR_BODY:%.*]]
+; OzVEC2: vector.body:
+; OzVEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; OzVEC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; OzVEC2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; OzVEC2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; OzVEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; OzVEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; OzVEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; OzVEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; OzVEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; OzVEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; OzVEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; OzVEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; OzVEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; OzVEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; OzVEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; OzVEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; OzVEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; OzVEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; OzVEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; OzVEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; OzVEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; OzVEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; OzVEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; OzVEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; OzVEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; OzVEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; OzVEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; OzVEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; OzVEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; OzVEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; OzVEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; OzVEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; OzVEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; OzVEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; OzVEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; OzVEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; OzVEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; OzVEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; OzVEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; OzVEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; OzVEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; OzVEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; OzVEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; OzVEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; OzVEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; OzVEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; OzVEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; OzVEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; OzVEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; OzVEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; OzVEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; OzVEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; OzVEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; OzVEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; OzVEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; OzVEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; OzVEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; OzVEC2-NEXT: ret i32 [[TMP78]]
+; OzVEC2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; OzVEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; OzVEC2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; OzVEC2-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; OzVEC2: for.end:
+; OzVEC2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT: ret i32 [[TMP6]]
;
; O3DIS-LABEL: @enabled(
; O3DIS-NEXT: entry:
@@ -1111,118 +439,22 @@
; O2-NEXT: entry:
; O2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; O2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O2-NEXT: br label [[VECTOR_BODY:%.*]]
+; O2: vector.body:
+; O2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; O2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; O2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; O2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
-; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
-; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
-; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
-; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
-; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
-; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
-; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
-; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
-; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
-; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
-; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
-; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
-; O2-NEXT: ret i32 [[TMP78]]
+; O2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; O2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; O2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; O2-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; O2: for.end:
+; O2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT: ret i32 [[TMP6]]
;
; O3-LABEL: @nopragma(
; O3-NEXT: entry:
@@ -1345,235 +577,43 @@
; O3DEFAULT-NEXT: entry:
; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0
; O3DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
-; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; O3DEFAULT: vector.body:
+; O3DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; O3DEFAULT-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; O3DEFAULT-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
-; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
-; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
-; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
-; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
-; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
-; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
-; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
-; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
-; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
-; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
-; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
-; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
-; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
-; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
-; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
-; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
-; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
-; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
-; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
-; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
-; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
-; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
-; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
-; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
-; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
-; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
-; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
-; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
-; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
-; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
-; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
-; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
-; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
-; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
-; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
-; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
-; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
-; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
-; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
-; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
-; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
-; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
-; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
-; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
-; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-;
O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* -; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; O3DEFAULT-NEXT: ret i32 [[TMP78]] +; O3DEFAULT-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4 +; O3DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; O3DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; O3DEFAULT-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; O3DEFAULT: for.end: +; O3DEFAULT-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +; O3DEFAULT-NEXT: ret i32 [[TMP6]] ; ; Os-LABEL: @nopragma( ; Os-NEXT: entry: ; Os-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i64 0 ; Os-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 -; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; Os-NEXT: br label [[VECTOR_BODY:%.*]] +; Os: vector.body: +; Os-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; Os-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; Os-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; Os-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] ; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], 
align 4 -; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 -; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 -; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 -; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 -; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 -; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 -; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 -; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 -; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 -; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 -; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 -; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 -; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 -; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 -; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 -; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 -; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 -; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 -; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 -; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* -; Os-NEXT: store <4 x i32> 
[[TMP40]], <4 x i32>* [[TMP42]], align 4 -; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 -; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 -; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 -; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 -; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 -; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 -; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 -; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 -; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 -; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 -; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 -; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 -; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 -; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 -; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 -; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 -; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 -; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 -; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 -; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 -; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] 
to <4 x i32>* -; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 -; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 -; Os-NEXT: ret i32 [[TMP78]] +; Os-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4 +; Os-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; Os-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; Os-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; Os: for.end: +; Os-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4 +; Os-NEXT: ret i32 [[TMP6]] ; ; Oz-LABEL: @nopragma( ; Oz-NEXT: entry: @@ -1613,7 +653,7 @@ ; O1VEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 ; O1VEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; O1VEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; O1VEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; O1VEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; O1VEC2: middle.block: ; O1VEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 ; O1VEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1629,7 +669,7 @@ ; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; O1VEC2: for.end: ; O1VEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4 ; O1VEC2-NEXT: ret i32 [[TMP10]] @@ -1655,7 +695,7 @@ ; OzVEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 ; OzVEC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; OzVEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; OzVEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; OzVEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; OzVEC2: middle.block: ; OzVEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 ; OzVEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1671,7 +711,7 @@ ; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; OzVEC2: for.end: ; OzVEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4 ; OzVEC2-NEXT: ret i32 [[TMP10]] @@ -1725,7 +765,7 @@ ; O1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; O1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 -; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; O1: for.end: ; O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 ; O1-NEXT: ret i32 [[TMP1]] @@ -1742,7 +782,7 @@ ; O2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; O2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O2-NEXT: [[EXITCOND:%.*]] = icmp eq 
i64 [[INDVARS_IV_NEXT]], 48 -; O2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; O2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; O2: for.end: ; O2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 ; O2-NEXT: ret i32 [[TMP1]] @@ -1865,7 +905,7 @@ ; Os-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; Os-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; Os-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 -; Os-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; Os-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; Os: for.end: ; Os-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 ; Os-NEXT: ret i32 [[TMP1]] @@ -1882,7 +922,7 @@ ; Oz-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; Oz-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; Oz-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 -; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; Oz: for.end: ; Oz-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 ; Oz-NEXT: ret i32 [[TMP1]] @@ -1899,7 +939,7 @@ ; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 -; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; O1VEC2: for.end: ; O1VEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 ; O1VEC2-NEXT: ret i32 [[TMP1]] @@ -1916,7 +956,7 @@ ; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 ; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 -; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; OzVEC2: for.end: ; OzVEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 ; OzVEC2-NEXT: ret i32 [[TMP1]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll @@ -12,129 +12,23 @@ define void @test_known_trip_count() { ; CHECK-LABEL: @test_known_trip_count( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr @b, align 16 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr @c, align 16 -; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD3]] -; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @a, align 16 -; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 2), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 2), align 16 -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[WIDE_LOAD_1]], [[WIDE_LOAD3_1]] -; CHECK-NEXT: store <2 x double> [[TMP1]], ptr 
getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 2), align 16 -; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 4), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 4), align 16 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD_2]], [[WIDE_LOAD3_2]] -; CHECK-NEXT: store <2 x double> [[TMP2]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 4), align 16 -; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 6), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 6), align 16 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[WIDE_LOAD_3]], [[WIDE_LOAD3_3]] -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 6), align 16 -; CHECK-NEXT: [[WIDE_LOAD_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 8), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 8), align 16 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD_4]], [[WIDE_LOAD3_4]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 8), align 16 -; CHECK-NEXT: [[WIDE_LOAD_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 10), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 10), align 16 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD_5]], [[WIDE_LOAD3_5]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 10), align 16 -; CHECK-NEXT: [[WIDE_LOAD_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 12), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 12), align 16 -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[WIDE_LOAD_6]], [[WIDE_LOAD3_6]] -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 12), align 16 -; CHECK-NEXT: [[WIDE_LOAD_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 14), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 14), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[WIDE_LOAD_7]], [[WIDE_LOAD3_7]] -; CHECK-NEXT: store <2 x double> [[TMP7]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 14), align 16 -; CHECK-NEXT: [[WIDE_LOAD_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 16), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 16), align 16 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[WIDE_LOAD_8]], [[WIDE_LOAD3_8]] -; CHECK-NEXT: store <2 x double> [[TMP8]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 16), align 16 -; CHECK-NEXT: [[WIDE_LOAD_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 18), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x 
double], ptr @c, i64 0, i64 18), align 16 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[WIDE_LOAD_9]], [[WIDE_LOAD3_9]] -; CHECK-NEXT: store <2 x double> [[TMP9]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 18), align 16 -; CHECK-NEXT: [[WIDE_LOAD_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 20), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 20), align 16 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[WIDE_LOAD_10]], [[WIDE_LOAD3_10]] -; CHECK-NEXT: store <2 x double> [[TMP10]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 20), align 16 -; CHECK-NEXT: [[WIDE_LOAD_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 22), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 22), align 16 -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[WIDE_LOAD_11]], [[WIDE_LOAD3_11]] -; CHECK-NEXT: store <2 x double> [[TMP11]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 22), align 16 -; CHECK-NEXT: [[WIDE_LOAD_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 24), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 24), align 16 -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[WIDE_LOAD_12]], [[WIDE_LOAD3_12]] -; CHECK-NEXT: store <2 x double> [[TMP12]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 24), align 16 -; CHECK-NEXT: [[WIDE_LOAD_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 26), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 26), align 16 -; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[WIDE_LOAD_13]], [[WIDE_LOAD3_13]] -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 26), align 16 -; CHECK-NEXT: [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 28), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 28), align 16 -; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[WIDE_LOAD_14]], [[WIDE_LOAD3_14]] -; CHECK-NEXT: store <2 x double> [[TMP14]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 28), align 16 -; CHECK-NEXT: [[WIDE_LOAD_15:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 30), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_15:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 30), align 16 -; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[WIDE_LOAD_15]], [[WIDE_LOAD3_15]] -; CHECK-NEXT: store <2 x double> [[TMP15]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 30), align 16 -; CHECK-NEXT: [[WIDE_LOAD_16:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 32), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_16:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 32), align 16 -; CHECK-NEXT: [[TMP16:%.*]] = fadd <2 x double> [[WIDE_LOAD_16]], [[WIDE_LOAD3_16]] -; CHECK-NEXT: store <2 x double> [[TMP16]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 32), align 16 
-; CHECK-NEXT: [[WIDE_LOAD_17:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 34), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_17:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 34), align 16 -; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[WIDE_LOAD_17]], [[WIDE_LOAD3_17]] -; CHECK-NEXT: store <2 x double> [[TMP17]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 34), align 16 -; CHECK-NEXT: [[WIDE_LOAD_18:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 36), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_18:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 36), align 16 -; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[WIDE_LOAD_18]], [[WIDE_LOAD3_18]] -; CHECK-NEXT: store <2 x double> [[TMP18]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 36), align 16 -; CHECK-NEXT: [[WIDE_LOAD_19:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 38), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_19:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 38), align 16 -; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x double> [[WIDE_LOAD_19]], [[WIDE_LOAD3_19]] -; CHECK-NEXT: store <2 x double> [[TMP19]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 38), align 16 -; CHECK-NEXT: [[WIDE_LOAD_20:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 40), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_20:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 40), align 16 -; CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x double> [[WIDE_LOAD_20]], [[WIDE_LOAD3_20]] -; CHECK-NEXT: store <2 x double> [[TMP20]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 40), align 16 -; CHECK-NEXT: [[WIDE_LOAD_21:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 42), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_21:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 42), align 16 -; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[WIDE_LOAD_21]], [[WIDE_LOAD3_21]] -; CHECK-NEXT: store <2 x double> [[TMP21]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 42), align 16 -; CHECK-NEXT: [[WIDE_LOAD_22:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 44), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_22:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 44), align 16 -; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[WIDE_LOAD_22]], [[WIDE_LOAD3_22]] -; CHECK-NEXT: store <2 x double> [[TMP22]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 44), align 16 -; CHECK-NEXT: [[WIDE_LOAD_23:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 46), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_23:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 46), align 16 -; CHECK-NEXT: [[TMP23:%.*]] = fadd <2 x double> [[WIDE_LOAD_23]], [[WIDE_LOAD3_23]] -; CHECK-NEXT: store <2 x double> [[TMP23]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 46), align 16 -; CHECK-NEXT: [[WIDE_LOAD_24:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 48), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_24:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, 
i64 0, i64 48), align 16 -; CHECK-NEXT: [[TMP24:%.*]] = fadd <2 x double> [[WIDE_LOAD_24]], [[WIDE_LOAD3_24]] -; CHECK-NEXT: store <2 x double> [[TMP24]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 48), align 16 -; CHECK-NEXT: [[WIDE_LOAD_25:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 50), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_25:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 50), align 16 -; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[WIDE_LOAD_25]], [[WIDE_LOAD3_25]] -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 50), align 16 -; CHECK-NEXT: [[WIDE_LOAD_26:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 52), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_26:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 52), align 16 -; CHECK-NEXT: [[TMP26:%.*]] = fadd <2 x double> [[WIDE_LOAD_26]], [[WIDE_LOAD3_26]] -; CHECK-NEXT: store <2 x double> [[TMP26]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 52), align 16 -; CHECK-NEXT: [[WIDE_LOAD_27:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 54), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_27:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 54), align 16 -; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[WIDE_LOAD_27]], [[WIDE_LOAD3_27]] -; CHECK-NEXT: store <2 x double> [[TMP27]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 54), align 16 -; CHECK-NEXT: [[WIDE_LOAD_28:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 56), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_28:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 56), align 16 -; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[WIDE_LOAD_28]], [[WIDE_LOAD3_28]] -; CHECK-NEXT: store <2 x double> [[TMP28]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 56), align 16 -; CHECK-NEXT: [[WIDE_LOAD_29:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 0), align 16 -; CHECK-NEXT: [[WIDE_LOAD3_29:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 0), align 16 -; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x double> [[WIDE_LOAD_29]], [[WIDE_LOAD3_29]] -; CHECK-NEXT: store <2 x double> [[TMP29]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 1, i64 0), align 16 -; CHECK-NEXT: [[TMP30:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 2), align 16 -; CHECK-NEXT: [[TMP31:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 2), align 16 -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[TMP31]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [58 x 
double], ptr @a, i64 0, i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP3]], align 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 60 +; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 2), align 16 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 2), align 16 +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP5]], [[TMP6]] ; CHECK-NEXT: store double [[ADD]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 1, i64 2), align 16 ; CHECK-NEXT: ret void ; @@ -176,166 +70,26 @@ ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER7:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292 -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N_VEC]], -4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[TMP0]], 2 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP2]], 7 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], 28 -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; CHECK: vector.ph.new: -; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], 9223372036854775800 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_7:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP4]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[TMP4]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP5]], align 16 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP6]], align 16 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 2 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP2]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 2 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD5]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD4]], [[WIDE_LOAD6]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> 
[[WIDE_LOAD]], [[WIDE_LOAD5]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[WIDE_LOAD4]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[TMP10]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP11]], align 16 -; CHECK-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_NEXT]] -; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP12]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_1:%.*]] = load <2 x double>, ptr [[TMP13]], align 16 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_NEXT]] -; CHECK-NEXT: [[WIDE_LOAD5_1:%.*]] = load <2 x double>, ptr [[TMP14]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_1:%.*]] = load <2 x double>, ptr [[TMP15]], align 16 -; CHECK-NEXT: [[TMP16:%.*]] = fadd <2 x double> [[WIDE_LOAD_1]], [[WIDE_LOAD5_1]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[WIDE_LOAD4_1]], [[WIDE_LOAD6_1]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_NEXT]] -; CHECK-NEXT: store <2 x double> [[TMP16]], ptr [[TMP18]], align 16 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP18]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP17]], ptr [[TMP19]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_NEXT_1]] -; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr [[TMP20]], align 16 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[TMP20]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_2:%.*]] = load <2 x double>, ptr [[TMP21]], align 16 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_NEXT_1]] -; CHECK-NEXT: [[WIDE_LOAD5_2:%.*]] = load <2 x double>, ptr [[TMP22]], align 16 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP22]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_2:%.*]] = load <2 x double>, ptr [[TMP23]], align 16 -; CHECK-NEXT: [[TMP24:%.*]] = fadd <2 x double> [[WIDE_LOAD_2]], [[WIDE_LOAD5_2]] -; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[WIDE_LOAD4_2]], [[WIDE_LOAD6_2]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_NEXT_1]] -; CHECK-NEXT: store <2 x double> [[TMP24]], ptr [[TMP26]], align 16 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr [[TMP27]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_NEXT_2]] -; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr [[TMP28]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP28]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_3:%.*]] = load <2 x double>, ptr [[TMP29]], align 16 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_NEXT_2]] -; CHECK-NEXT: [[WIDE_LOAD5_3:%.*]] = load <2 x double>, ptr [[TMP30]], align 16 -; CHECK-NEXT: [[TMP31:%.*]] = 
getelementptr inbounds double, ptr [[TMP30]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_3:%.*]] = load <2 x double>, ptr [[TMP31]], align 16 -; CHECK-NEXT: [[TMP32:%.*]] = fadd <2 x double> [[WIDE_LOAD_3]], [[WIDE_LOAD5_3]] -; CHECK-NEXT: [[TMP33:%.*]] = fadd <2 x double> [[WIDE_LOAD4_3]], [[WIDE_LOAD6_3]] -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_NEXT_2]] -; CHECK-NEXT: store <2 x double> [[TMP32]], ptr [[TMP34]], align 16 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP33]], ptr [[TMP35]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_3:%.*]] = or i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_NEXT_3]] -; CHECK-NEXT: [[WIDE_LOAD_4:%.*]] = load <2 x double>, ptr [[TMP36]], align 16 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, ptr [[TMP36]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_4:%.*]] = load <2 x double>, ptr [[TMP37]], align 16 -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_NEXT_3]] -; CHECK-NEXT: [[WIDE_LOAD5_4:%.*]] = load <2 x double>, ptr [[TMP38]], align 16 -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds double, ptr [[TMP38]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_4:%.*]] = load <2 x double>, ptr [[TMP39]], align 16 -; CHECK-NEXT: [[TMP40:%.*]] = fadd <2 x double> [[WIDE_LOAD_4]], [[WIDE_LOAD5_4]] -; CHECK-NEXT: [[TMP41:%.*]] = fadd <2 x double> [[WIDE_LOAD4_4]], [[WIDE_LOAD6_4]] -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_NEXT_3]] -; CHECK-NEXT: store <2 x double> [[TMP40]], ptr [[TMP42]], align 16 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, ptr [[TMP42]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP41]], ptr [[TMP43]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_4:%.*]] = or i64 [[INDEX]], 20 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_NEXT_4]] -; CHECK-NEXT: [[WIDE_LOAD_5:%.*]] = load <2 x double>, ptr [[TMP44]], align 16 -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, ptr [[TMP44]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_5:%.*]] = load <2 x double>, ptr [[TMP45]], align 16 -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_NEXT_4]] -; CHECK-NEXT: [[WIDE_LOAD5_5:%.*]] = load <2 x double>, ptr [[TMP46]], align 16 -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, ptr [[TMP46]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_5:%.*]] = load <2 x double>, ptr [[TMP47]], align 16 -; CHECK-NEXT: [[TMP48:%.*]] = fadd <2 x double> [[WIDE_LOAD_5]], [[WIDE_LOAD5_5]] -; CHECK-NEXT: [[TMP49:%.*]] = fadd <2 x double> [[WIDE_LOAD4_5]], [[WIDE_LOAD6_5]] -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_NEXT_4]] -; CHECK-NEXT: store <2 x double> [[TMP48]], ptr [[TMP50]], align 16 -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP49]], ptr [[TMP51]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_5:%.*]] = or i64 [[INDEX]], 24 -; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_NEXT_5]] -; CHECK-NEXT: [[WIDE_LOAD_6:%.*]] = load <2 x double>, ptr [[TMP52]], align 16 -; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds double, ptr [[TMP52]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_6:%.*]] = load <2 x double>, ptr [[TMP53]], align 16 -; CHECK-NEXT: 
[[TMP54:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_NEXT_5]] -; CHECK-NEXT: [[WIDE_LOAD5_6:%.*]] = load <2 x double>, ptr [[TMP54]], align 16 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, ptr [[TMP54]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_6:%.*]] = load <2 x double>, ptr [[TMP55]], align 16 -; CHECK-NEXT: [[TMP56:%.*]] = fadd <2 x double> [[WIDE_LOAD_6]], [[WIDE_LOAD5_6]] -; CHECK-NEXT: [[TMP57:%.*]] = fadd <2 x double> [[WIDE_LOAD4_6]], [[WIDE_LOAD6_6]] -; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_NEXT_5]] -; CHECK-NEXT: store <2 x double> [[TMP56]], ptr [[TMP58]], align 16 -; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP57]], ptr [[TMP59]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_6:%.*]] = or i64 [[INDEX]], 28 -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_NEXT_6]] -; CHECK-NEXT: [[WIDE_LOAD_7:%.*]] = load <2 x double>, ptr [[TMP60]], align 16 -; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds double, ptr [[TMP60]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_7:%.*]] = load <2 x double>, ptr [[TMP61]], align 16 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_NEXT_6]] -; CHECK-NEXT: [[WIDE_LOAD5_7:%.*]] = load <2 x double>, ptr [[TMP62]], align 16 -; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds double, ptr [[TMP62]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_7:%.*]] = load <2 x double>, ptr [[TMP63]], align 16 -; CHECK-NEXT: [[TMP64:%.*]] = fadd <2 x double> [[WIDE_LOAD_7]], [[WIDE_LOAD5_7]] -; CHECK-NEXT: [[TMP65:%.*]] = fadd <2 x double> [[WIDE_LOAD4_7]], [[WIDE_LOAD6_7]] -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_NEXT_6]] -; CHECK-NEXT: store <2 x double> [[TMP64]], ptr [[TMP66]], align 16 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds double, ptr [[TMP66]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP65]], ptr [[TMP67]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_7]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8 -; CHECK-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]] -; CHECK-NEXT: br i1 [[NITER_NCMP_7]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block.unr-lcssa: -; CHECK-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_7]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; CHECK: vector.body.epil: -; CHECK-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], [[VECTOR_BODY_EPIL]] ], [ 0, [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX_EPIL]] -; CHECK-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <2 x double>, ptr [[TMP68]], align 16 -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds double, ptr [[TMP68]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4_EPIL:%.*]] = load <2 x double>, ptr [[TMP69]], align 16 -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX_EPIL]] -; CHECK-NEXT: [[WIDE_LOAD5_EPIL:%.*]] = load <2 x double>, ptr [[TMP70]], align 16 -; CHECK-NEXT: 
[[TMP71:%.*]] = getelementptr inbounds double, ptr [[TMP70]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6_EPIL:%.*]] = load <2 x double>, ptr [[TMP71]], align 16 -; CHECK-NEXT: [[TMP72:%.*]] = fadd <2 x double> [[WIDE_LOAD_EPIL]], [[WIDE_LOAD5_EPIL]] -; CHECK-NEXT: [[TMP73:%.*]] = fadd <2 x double> [[WIDE_LOAD4_EPIL]], [[WIDE_LOAD6_EPIL]] -; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX_EPIL]] -; CHECK-NEXT: store <2 x double> [[TMP72]], ptr [[TMP74]], align 16 -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds double, ptr [[TMP74]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP73]], ptr [[TMP75]], align 16 -; CHECK-NEXT: [[INDEX_NEXT_EPIL]] = add nuw i64 [[INDEX_EPIL]], 4 -; CHECK-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 -; CHECK-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] -; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[FOR_BODY_PREHEADER7]] @@ -345,15 +99,15 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER7]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP76:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP77:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP76]], [[TMP77]] +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ;
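Note on the updated expectations above: in both test files the new CHECK lines keep a single rolled-up vector.body, stepping the induction variable by one vector (or interleave group) per iteration, where the old expectations spelled out every unrolled copy with constant offsets. A minimal C++ sketch of the loop shape these tests exercise follows; the function and variable names (vadd, a, b, c, n) are illustrative only and are not taken from the tests.

    // Hypothetical source loop. The loop vectorizer turns the body into the
    // <2 x double> (or, in the nopragma test, <4 x i32>) vector.body blocks
    // checked above; with this patch the unroller then leaves that already
    // vectorized loop rolled instead of cloning it per unroll factor.
    void vadd(double *a, const double *b, const double *c, long n) {
      for (long i = 0; i < n; ++i)
        a[i] = b[i] + c[i];
    }

Keeping the vector loop rolled trades the back-edge overhead that full unrolling removed for much smaller code; the vectorizer has already applied its own interleaving (visible in the second excessive-unrolling.ll hunk, where each vector.body iteration still processes two <2 x double> chunks), so further runtime unrolling mostly duplicated that pattern and its unr-lcssa/epilogue scaffolding.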