Index: llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1673,10 +1673,13 @@
     // Note that if our source is a gep chain itself then we wait for that
     // chain to be resolved before we perform this transformation.  This
    // avoids us creating a TON of code in some cases.
-    if (GEPOperator *SrcGEP =
-          dyn_cast<GEPOperator>(Src->getOperand(0)))
-      if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
-        return nullptr;   // Wait until our source is folded to completion.
+    // If the source has only one use, there is no need to wait for the
+    // chain to be resolved.
+    if (!Src->hasOneUse()) {
+      if (GEPOperator *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
+        if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
+          return nullptr;  // Wait until our source is folded to completion.
+    }
 
     SmallVector<Value *, 8> Indices;
 
@@ -1704,8 +1707,22 @@
           SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
       // Only do the combine when we are sure the cost after the
       // merge is never more than that before the merge.
-      if (Sum == nullptr)
-        return nullptr;
+      if (Sum == nullptr) {
+        if (!LI)
+          return nullptr;
+
+        // If both GO1 and SO1 are loop-invariant, they can still be
+        // combined. However, if the first operand of the Src GEP is also
+        // loop-invariant, don't combine: LICM will hoist them out of the
+        // loop anyway, and combining would only create extra
+        // instructions.
+        Loop *L = LI->getLoopFor(GEP.getParent());
+        if (!L || !L->isLoopInvariant(GO1) || !L->isLoopInvariant(SO1) ||
+            L->isLoopInvariant(Src->getOperand(0)))
+          return nullptr;
+
+        Sum = Builder.CreateAdd(SO1, GO1, PtrOp->getName() + ".sum");
+      }
 
       // Update the GEP in place if possible.
       if (Src->getNumOperands() == 2) {
Index: llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @foo(i8* nocapture readnone %match, i32 %cur_match, i32 %best_len, i32 %scan_end, i32* nocapture readonly %prev, i32 %limit, i32 %chain_length, i8* nocapture readonly %win, i32 %wmask) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[IDX_EXT2:%.*]] = zext i32 [[CUR_MATCH:%.*]] to i64
+; CHECK-NEXT:    [[ADD_PTR4:%.*]] = getelementptr inbounds i8, i8* [[WIN:%.*]], i64 [[IDX_EXT2]]
+; CHECK-NEXT:    [[IDX_EXT1:%.*]] = zext i32 [[BEST_LEN:%.*]] to i64
+; CHECK:       if.then.lr.ph:
+; CHECK-NEXT:    br label [[IF_THEN:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[IDX_EXT:%.*]] = zext i32 [[TMP4:%.*]] to i64
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[WIN]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ADD_PTR2_SUM:%.*]] = add nsw i64 [[IDX_EXT1]], -1
+; CHECK-NEXT:    [[ADD_PTR3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[ADD_PTR2_SUM]]
+;
+entry:
+  %idx.ext2 = zext i32 %cur_match to i64
+  %add.ptr4 = getelementptr inbounds i8, i8* %win, i64 %idx.ext2
+  %idx.ext1 = zext i32 %best_len to i64
+  %add.ptr25 = getelementptr inbounds i8, i8* %add.ptr4, i64 %idx.ext1
+  %add.ptr36 = getelementptr inbounds i8, i8* %add.ptr25, i64 -1
+  %0 = bitcast i8* %add.ptr36 to i32*
+  %1 = load i32, i32* %0, align 4
+  %cmp7 = icmp eq i32 %1, %scan_end
+  br i1 %cmp7, label %do.end, label %if.then.lr.ph
+
+if.then.lr.ph:                                    ; preds = %entry
+  br label %if.then
+
+do.body:                                          ; preds = %land.lhs.true
+  %chain_length.addr.0 = phi i32 [ %dec, %land.lhs.true ]
+  %cur_match.addr.0 = phi i32 [ %4, %land.lhs.true ]
+  %idx.ext = zext i32 %cur_match.addr.0 to i64
+  %add.ptr = getelementptr inbounds i8, i8* %win, i64 %idx.ext
+  %add.ptr2 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext1
+  %add.ptr3 = getelementptr inbounds i8, i8* %add.ptr2, i64 -1
+  %2 = bitcast i8* %add.ptr3 to i32*
+  %3 = load i32, i32* %2, align 4
+  %cmp = icmp eq i32 %3, %scan_end
+  br i1 %cmp, label %do.end, label %if.then
+
+if.then:                                          ; preds = %if.then.lr.ph, %do.body
+  %cur_match.addr.09 = phi i32 [ %cur_match, %if.then.lr.ph ], [ %cur_match.addr.0, %do.body ]
+  %chain_length.addr.08 = phi i32 [ %chain_length, %if.then.lr.ph ], [ %chain_length.addr.0, %do.body ]
+  %and = and i32 %cur_match.addr.09, %wmask
+  %idxprom = zext i32 %and to i64
+  %arrayidx = getelementptr inbounds i32, i32* %prev, i64 %idxprom
+  %4 = load i32, i32* %arrayidx, align 4
+  %cmp4 = icmp ugt i32 %4, %limit
+  br i1 %cmp4, label %land.lhs.true, label %do.end
+
+land.lhs.true:                                    ; preds = %if.then
+  %dec = add i32 %chain_length.addr.08, -1
+  %cmp5 = icmp eq i32 %dec, 0
+  br i1 %cmp5, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body, %land.lhs.true, %if.then, %entry
+  %cont.0 = phi i32 [ 1, %entry ], [ 0, %if.then ], [ 0, %land.lhs.true ], [ 1, %do.body ]
+  ret i32 %cont.0
+}
Index: llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++
llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -2430,93 +2430,84 @@ ; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -7 +; AVX1-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 -; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -11 +; AVX1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 -; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 -; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -15 +; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 ; AVX1-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer -; AVX1-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer -; AVX1-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer -; AVX1-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 -; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> -; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 -; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 -; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX1-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX1-NEXT: [[TMP13:%.*]] = 
icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX1-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP15]], i64 -3 +; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP15]], i64 -7 +; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP15]], i64 -11 +; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP15]], i64 -15 +; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> ; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 -; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> -; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 -; AVX1-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 -; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> -; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 -; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], -; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], -; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], -; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], -; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], 
i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], +; AVX1-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], +; AVX1-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], +; AVX1-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i64 -3 +; AVX1-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP24]], <4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP28]], i64 -7 +; AVX1-NEXT: [[TMP32:%.*]] = bitcast double* [[TMP31]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP25]], <4 x double>* [[TMP32]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP33:%.*]] = getelementptr double, double* [[TMP28]], i64 -11 +; AVX1-NEXT: [[TMP34:%.*]] = bitcast double* [[TMP33]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP34]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP28]], i64 -15 ; AVX1-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 -; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 -; AVX1-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX1-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 -; AVX1-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 -; AVX1-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 -; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 -; AVX1-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX1-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX1-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX1-NEXT: br i1 [[TMP37]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], 
[[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX1-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP38]], 0 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX1-NEXT: [[TMP39:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP39]], 5.000000e-01 ; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] -; AVX1-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX1-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP40]], 0 ; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; AVX1: if.then.1: ; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] -; AVX1-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 -; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX1-NEXT: [[TMP41:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP41]], 5.000000e-01 ; AVX1-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] ; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 ; AVX1-NEXT: br label [[FOR_INC_1]] @@ -2548,119 +2539,110 @@ ; AVX2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 -; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -7 +; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 -; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -11 +; AVX2-NEXT: [[TMP8:%.*]] 
= bitcast i32* [[TMP7]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 -; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -15 +; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 ; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer -; AVX2-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer -; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer -; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 -; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 -; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP15]], i64 -3 +; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP15]], i64 -7 +; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP12]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP15]], i64 -11 +; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP13]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 
x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP15]], i64 -15 +; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 -; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 -; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> -; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], -; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], -; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], -; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], -; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], +; AVX2-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], +; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], +; AVX2-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i64 -3 +; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP24]], <4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP28]], i64 -7 +; AVX2-NEXT: [[TMP32:%.*]] = bitcast double* [[TMP31]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP25]], <4 x double>* [[TMP32]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP33:%.*]] = getelementptr double, double* [[TMP28]], i64 -11 +; AVX2-NEXT: [[TMP34:%.*]] = bitcast double* [[TMP33]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP34]], i32 8, <4 x i1> [[REVERSE31]]), 
!alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP28]], i64 -15 ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 -; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 -; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX2-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 -; AVX2-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 -; AVX2-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX2-NEXT: br i1 [[TMP37]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX2-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP38]], 0 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX2-NEXT: [[TMP39:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP39]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] -; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX2-NEXT: 
[[TMP40:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP40]], 0 ; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; AVX2: if.then.1: ; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] -; AVX2-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 -; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX2-NEXT: [[TMP41:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP41]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] ; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 ; AVX2-NEXT: br label [[FOR_INC_1]] ; AVX2: for.inc.1: ; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 ; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] -; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 -; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX2-NEXT: [[TMP42:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP42]], 0 ; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] ; AVX2: if.then.2: ; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] -; AVX2-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 -; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX2-NEXT: [[TMP43:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP43]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] ; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 ; AVX2-NEXT: br label [[FOR_INC_2]] ; AVX2: for.inc.2: ; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 ; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] -; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 -; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX2-NEXT: [[TMP44:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP44]], 0 ; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] ; AVX2: if.then.3: ; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] -; AVX2-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 -; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01 +; AVX2-NEXT: [[TMP45:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP45]], 5.000000e-01 ; AVX2-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] ; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 ; AVX2-NEXT: br label [[FOR_INC_3]] @@ -2692,119 +2674,110 @@ ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 -; AVX512-NEXT: [[TMP6:%.*]] = 
getelementptr i32, i32* [[TMP5]], i64 -7 -; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53 +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -15 +; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -16 -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -7 -; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53 +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -23 +; AVX512-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -24 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -7 -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -31 +; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53 ; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> -; AVX512-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -7 -; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> -; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -7 -; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP15]], i64 -7 +; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x 
i1> [[TMP11]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP17]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP15]], i64 -15 +; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP19]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP15]], i64 -23 +; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP21]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP15]], i64 -31 +; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -16 -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -7 -; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> -; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -24 -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -7 -; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> -; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56 -; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], -; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], -; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], -; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -7 +; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP23]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP24:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], +; AVX512-NEXT: [[TMP25:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], +; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], +; 
AVX512-NEXT: [[TMP27:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[TMP28]], i64 -7 +; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP24]], <8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP28]], i64 -15 +; AVX512-NEXT: [[TMP32:%.*]] = bitcast double* [[TMP31]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP25]], <8 x double>* [[TMP32]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr double, double* [[TMP28]], i64 -23 +; AVX512-NEXT: [[TMP34:%.*]] = bitcast double* [[TMP33]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP34]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP28]], i64 -31 ; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -7 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -16 -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -7 -; AVX512-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60 -; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -24 -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -7 -; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; AVX512-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX512-NEXT: br i1 [[TMP37]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; 
AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX512-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP38]], 0 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX512-NEXT: [[TMP39:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP39]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] -; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX512-NEXT: [[TMP40:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP40]], 0 ; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; AVX512: if.then.1: ; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] -; AVX512-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 -; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX512-NEXT: [[TMP41:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP41]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] ; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 ; AVX512-NEXT: br label [[FOR_INC_1]] ; AVX512: for.inc.1: ; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 ; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] -; AVX512-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 -; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX512-NEXT: [[TMP42:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP42]], 0 ; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] ; AVX512: if.then.2: ; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] -; AVX512-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 -; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX512-NEXT: [[TMP43:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP43]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] ; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 ; AVX512-NEXT: br label [[FOR_INC_2]] ; AVX512: for.inc.2: ; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 ; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] -; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 -; 
AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX512-NEXT: [[TMP44:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP44]], 0 ; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] ; AVX512: if.then.3: ; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] -; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 -; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01 +; AVX512-NEXT: [[TMP45:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP45]], 5.000000e-01 ; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] ; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 ; AVX512-NEXT: br label [[FOR_INC_3]]
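
For reference, a minimal sketch of the pattern the change enables (not part of the patch; the function and value names below are illustrative only). The source GEP %g1 has a single use and its base %p varies per iteration, while both indices %a and %b are loop-invariant, so the two GEPs can now be merged even though the indices do not fold to a constant; the resulting add is loop-invariant, and a later LICM run can hoist it out of the loop. Running something like `opt -instcombine -S` on this input should give roughly the shape shown in the trailing comment.

; Sketch only: loop-variant base %p, loop-invariant indices %a and %b.
define i8 @gep_merge_sketch(i8* %base, i64 %a, i64 %b, i64 %n) {
entry:
  br label %loop

loop:                                             ; preds = %loop, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %p = getelementptr inbounds i8, i8* %base, i64 %iv  ; varies per iteration
  %g1 = getelementptr inbounds i8, i8* %p, i64 %a     ; source GEP, single use
  %g2 = getelementptr inbounds i8, i8* %g1, i64 %b    ; merged with %g1 by this patch
  %v = load i8, i8* %g2, align 1
  %iv.next = add i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, %n
  br i1 %cmp, label %exit, label %loop

exit:                                             ; preds = %loop
  ret i8 %v
}
; Expected shape after the combine (names approximate):
;   %g1.sum = add i64 %a, %b
;   %g2 = getelementptr inbounds i8, i8* %p, i64 %g1.sum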