diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1864,7 +1864,7 @@ } // Eliminate redundant IV cycles. - NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts, TTI); // Try to convert exit conditions to unsigned // Note: Handles invalidation internally if needed. diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll --- a/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr27133.ll @@ -9,11 +9,11 @@ ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[C_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[INDVARS1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: invoke void @fn1(i64 [[INDVARS_IV]]) ; CHECK-NEXT: to label [[FOR_INC]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: catch.dispatch: -; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi i32 [ [[C_0]], [[FOR_COND]] ] +; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi i32 [ [[INDVARS1]], [[FOR_COND]] ] ; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch] unwind to caller ; CHECK: catch: ; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] [i8* null, i32 64, i8* null] @@ -22,7 +22,6 @@ ; CHECK-NEXT: ret i32 [[C_0_LCSSA]] ; CHECK: for.inc: ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[C_0]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; entry: diff --git a/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll --- a/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-loop-comp.ll @@ -103,17 +103,17 @@ ; CHECK: for.cond1.preheader.preheader: ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; CHECK: for.cond1.preheader.us: -; CHECK-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[FOR_INC13_US:%.*]] ] +; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC13_US:%.*]] ] ; CHECK-NEXT: br i1 true, label [[FOR_BODY4_LR_PH_US:%.*]], label [[FOR_INC13_US]] ; CHECK: for.inc13.us.loopexit: ; CHECK-NEXT: br label [[FOR_INC13_US]] ; CHECK: for.inc13.us: -; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV2]], 1 -; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 4 -; CHECK-NEXT: br i1 [[EXITCOND4]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END_LOOPEXIT1:%.*]] +; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 +; CHECK-NEXT: [[EXITCOND6:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT4]], 4 +; CHECK-NEXT: br i1 [[EXITCOND6]], label [[FOR_COND1_PREHEADER_US]], label [[FOR_END_LOOPEXIT1:%.*]] ; CHECK: for.body4.us: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY4_LR_PH_US]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds [8 x i8], [8 x i8]* [[A:%.*]], i64 [[INDVARS_IV2]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds [8 x i8], [8 x i8]* [[A:%.*]], i64 [[INDVARS_IV3]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX6_US]], align 1 ; CHECK-NEXT: [[IDXPROM7_US:%.*]] = zext i8 [[TMP0]] to i64 ; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[IDXPROM7_US]] @@ -549,6 +549,7 @@ ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 ; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[BACKEDGE]] ; CHECK: backedge: @@ -556,7 +557,6 @@ ; CHECK-NEXT: store i32 1, i32* [[STORE_ADDR]], align 4 ; CHECK-NEXT: [[STOP:%.*]] = load i32, i32* [[Q:%.*]], align 4 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i32 [[STOP]], 0 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[FAILURE:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 @@ -599,11 +599,11 @@ ; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: [[STORE_ADDR:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[TMP1]] ; CHECK-NEXT: store i32 1, i32* [[STORE_ADDR]], align 4 ; CHECK-NEXT: [[STOP:%.*]] = load i32, i32* [[Q:%.*]], align 4 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i32 [[STOP]], 0 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[FAILURE:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 @@ -862,8 +862,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[GUARDED:%.*]] ; CHECK: guarded: -; CHECK-NEXT: [[ICMP_USER_WIDE4:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] -; CHECK-NEXT: br i1 [[ICMP_USER_WIDE4]], label [[BACKEDGE]], label [[SIDE_EXIT:%.*]] +; CHECK-NEXT: [[ICMP_USER_WIDE5:%.*]] = icmp ult i64 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[ICMP_USER_WIDE5]], label [[BACKEDGE]], label [[SIDE_EXIT:%.*]] ; CHECK: backedge: ; CHECK-NEXT: [[STORE_ADDR:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[TMP3]] ; CHECK-NEXT: store i32 1, i32* [[STORE_ADDR]], align 4 @@ -1266,13 +1266,13 @@ ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: [[COND_1:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 ; CHECK-NEXT: br i1 [[COND_1]], label [[EXIT:%.*]], label [[BACKEDGE]] ; CHECK: backedge: ; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, i32* [[ADDR]] unordered, align 4 ; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], 0 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 -1 to i32 @@ -1312,10 +1312,9 @@ ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: call void @bar(i32 [[IV_NEXT]]) +; CHECK-NEXT: [[INDVARS2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: call void @bar(i32 [[INDVARS2]]) ; CHECK-NEXT: call void @foo(i64 [[INDVARS_IV]]) ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT:%.*]], label [[LOOP]] @@ -1412,11 +1411,10 @@ ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ [[VAL_INC:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[VAL_INC]] = add nuw nsw i32 [[VAL]], 1 -; CHECK-NEXT: store i32 [[VAL_INC]], i32* [[PTR]], align 4 -; CHECK-NEXT: call void @foo(i64 [[INDVARS_IV]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: store i32 [[INDVARS]], i32* [[PTR]], align 4 +; CHECK-NEXT: call void @foo(i64 [[INDVARS_IV]]) ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1000 ; CHECK-NEXT: br i1 [[LOOP_COND]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -25,22 +25,22 @@ ; AVX512-NEXT: iter.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] +; AVX512-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]] ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4 ; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX7]] ; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> poison) ; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64> ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef) ; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX7]] ; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]]) -; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX7]], 16 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 @@ -55,7 +55,7 @@ ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]] ; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]]) -; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 32 +; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX7]], 32 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4 @@ -70,7 +70,7 @@ ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]] ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]]) -; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48 +; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX7]], 48 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4 @@ -85,7 +85,7 @@ ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]] ; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>* ; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]]) -; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64 +; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX7]], 64 ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096 ; AVX512-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX512: for.end: @@ -95,52 +95,52 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX7]] ; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 ; FVW2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; FVW2-NEXT: [[WIDE_LOAD7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 +; FVW2-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 ; FVW2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 ; FVW2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* -; FVW2-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; FVW2-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 ; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 ; FVW2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* -; FVW2-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 +; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 ; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer -; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD7]], zeroinitializer -; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer -; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer -; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD8]], zeroinitializer +; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD9]], zeroinitializer +; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX7]] ; FVW2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> poison) ; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2 ; FVW2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>* -; FVW2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> poison) +; FVW2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> poison) ; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4 ; FVW2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>* -; FVW2-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> poison) +; FVW2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> poison) ; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 6 ; FVW2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>* -; FVW2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> poison) +; FVW2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> poison) ; FVW2-NEXT: [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64> -; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD10]] to <2 x i64> -; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD11]] to <2 x i64> -; FVW2-NEXT: [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD12]] to <2 x i64> +; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD11]] to <2 x i64> +; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD12]] to <2 x i64> +; FVW2-NEXT: [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64> ; FVW2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]] ; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]] ; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]] ; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]] ; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef) -; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef) -; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef) -; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef) ; FVW2-NEXT: [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], -; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], -; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], -; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], -; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], +; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], +; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], +; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX7]] ; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>* ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]]) ; FVW2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 2 @@ -152,7 +152,7 @@ ; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 6 ; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>* ; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]]) -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX6]], 8 +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 8 ; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; FVW2: for.end: @@ -235,128 +235,128 @@ ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void @@ -365,9 +365,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -388,14 +388,14 @@ ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.if7: +; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.if8: ; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]] ; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 ; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.continue8: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX6]], 2 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.continue9: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] @@ -480,128 +480,128 @@ ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void @@ -610,8 +610,8 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE7]] ] +; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] ; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] @@ -633,13 +633,13 @@ ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; FVW2: pred.store.if6: +; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] +; FVW2: pred.store.if7: ; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP0]], i32 1 ; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 ; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE7]] -; FVW2: pred.store.continue7: +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE8]] +; FVW2: pred.store.continue8: ; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 @@ -711,128 +711,128 @@ ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void @@ -841,9 +841,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -864,14 +864,14 @@ ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.if7: +; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.if8: ; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]] ; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 ; FVW2-NEXT: store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.continue8: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX6]], 2 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.continue9: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -942,128 +942,128 @@ ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void @@ -1072,9 +1072,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -1095,14 +1095,14 @@ ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.if7: +; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.if8: ; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP0]] ; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 ; FVW2-NEXT: store float [[TMP15]], float* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.continue8: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX6]], 2 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.continue9: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -1173,128 +1173,128 @@ ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer ; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) -; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8]], ; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) -; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_1]], ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_2]], ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_3]], ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_4]], ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_5]], ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_6]], ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_7]], ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_8]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_9]], ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_10]], ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_11]], ; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) ; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer ; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_12]], ; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) ; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer ; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_13]], ; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) ; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer ; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_14]], ; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) ; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer ; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], +; AVX512-NEXT: [[WIDE_MASKED_GATHER8_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER8_15]], ; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void @@ -1303,9 +1303,9 @@ ; FVW2-NEXT: entry: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ] -; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[INDEX7:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE9]] ] +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX7]], 4 ; FVW2-NEXT: [[TMP0:%.*]] = or i64 [[OFFSET_IDX]], 16 ; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[OFFSET_IDX]] ; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] @@ -1326,14 +1326,14 @@ ; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] ; FVW2: pred.store.continue: ; FVW2-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 -; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.if7: +; FVW2-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.if8: ; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP0]] ; FVW2-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 ; FVW2-NEXT: store float [[TMP15]], float addrspace(1)* [[TMP14]], align 4 -; FVW2-NEXT: br label [[PRED_STORE_CONTINUE8]] -; FVW2: pred.store.continue8: -; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX6]], 2 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE9]] +; FVW2: pred.store.continue9: +; FVW2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX7]], 2 ; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; FVW2-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll @@ -26,7 +26,7 @@ ; CHECK: for.body.preheader: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[S]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[S]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER4:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER5:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967288 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -37,9 +37,9 @@ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_LOAD3]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP4]], ; CHECK-NEXT: [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP4]], @@ -57,12 +57,12 @@ ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER4]] -; CHECK: for.body.preheader4: +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER5]] +; CHECK: for.body.preheader5: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER4]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER5]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PIN]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP17]] to i32 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -19,28 +19,28 @@ ; CHECK: for.body.preheader: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER17:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER18:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP6]], [[X]] +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP7]], [[X]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER17]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER18]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x double> poison, double [[A]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT11]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT13:%.*]] = insertelement <4 x double> poison, double [[A]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT13]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <4 x double> poison, double [[A]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT15]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x double> poison, double [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT12]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x double> poison, double [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT16]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP1:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT12]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT14]] -; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT16]] +; CHECK-NEXT: [[TMP1:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT13]] +; CHECK-NEXT: [[TMP2:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP3:%.*]] = fdiv fast <4 x double> , [[BROADCAST_SPLAT17]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -49,17 +49,17 @@ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]], !alias.scope !7 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 4 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>* -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8, !tbaa [[TBAA3]], !alias.scope !7 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8, !tbaa [[TBAA3]], !alias.scope !7 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 8 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <4 x double>* -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x double>, <4 x double>* [[TMP9]], align 8, !tbaa [[TBAA3]], !alias.scope !7 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x double>, <4 x double>* [[TMP9]], align 8, !tbaa [[TBAA3]], !alias.scope !7 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 12 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <4 x double>* -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x double>, <4 x double>* [[TMP11]], align 8, !tbaa [[TBAA3]], !alias.scope !7 +; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x double>, <4 x double>* [[TMP11]], align 8, !tbaa [[TBAA3]], !alias.scope !7 ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[WIDE_LOAD9]], [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x double> [[WIDE_LOAD10]], [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <4 x double> [[WIDE_LOAD9]], [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[WIDE_LOAD10]], [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x double> [[WIDE_LOAD11]], [[TMP3]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* ; CHECK-NEXT: store <4 x double> [[TMP12]], <4 x double>* [[TMP17]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7 @@ -77,8 +77,8 @@ ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER17]] -; CHECK: for.body.preheader17: +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER18]] +; CHECK: for.body.preheader18: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[TMP25:%.*]] = xor i64 [[INDVARS_IV_PH]], -1 ; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[TMP25]], [[WIDE_TRIP_COUNT]] @@ -101,17 +101,17 @@ ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0 ; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.body.prol.loopexit: -; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER17]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER18]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] ; CHECK-NEXT: [[TMP29:%.*]] = icmp ult i64 [[TMP26]], 3 -; CHECK-NEXT: br i1 [[TMP29]], label [[FOR_END]], label [[FOR_BODY_PREHEADER17_NEW:%.*]] -; CHECK: for.body.preheader17.new: +; CHECK-NEXT: br i1 [[TMP29]], label [[FOR_END]], label [[FOR_BODY_PREHEADER18_NEW:%.*]] +; CHECK: for.body.preheader18.new: ; CHECK-NEXT: [[TMP30:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP31:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP32:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP33:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER17_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER18_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[T0:%.*]] = load double, double* [[ARRAYIDX]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[TMP34:%.*]] = fmul fast double [[T0]], [[TMP30]]