Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1909,11 +1909,9 @@ return; } - // If we haven't yet vectorized the induction variable, splat the scalar - // induction variable, and build the necessary step vectors. - // TODO: Don't do it unless the vectorized IV is really required. + // All IV users are scalar instructions, so only emit a scalar IV, not a + // vectorised IV. Value *ScalarIV = CreateScalarIV(Step); - //CreateSplatIV(ScalarIV, Step); buildScalarSteps(ScalarIV, Step, EntryVal, ID); } @@ -4589,9 +4587,8 @@ if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction) continue; - // If tail-folding is applied, the primary induction variable is splat to a - // vector and used by the masked load/store vector instructions, thus the - // IV does not remain scalar. + // If tail-folding is applied, the primary induction variable will be used + // to feed a vector compare. if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) continue; Index: llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll @@ -16,9 +16,6 @@ ; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph ; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0 -; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer -; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, ; FORCED-NEXT: %0 = add i32 %index, 0 ; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0 ; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 0 @@ -68,9 +65,6 @@ ; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph ; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0 -; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer -; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, ; FORCED-NEXT: %0 = add i32 %index, 0 ; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0 ; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 0 Index: llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll @@ -65,15 +65,9 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]] ; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP18]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT6]], ; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64 Index: llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll +++ llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll @@ -44,9 +44,6 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 Index: llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -19,9 +19,6 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]] @@ -32,6 +29,25 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[BB1:%.*]] ] +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ] +; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64 +; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]] +; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16* +; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64 +; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]] +; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]] +; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1 +; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2 +; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop !2 +; CHECK: bb3: +; CHECK-NEXT: ret void +; bb1: br label %bb2 Index: llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -97,9 +97,6 @@ ; AVX: vector.body: ; AVX-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] -; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; AVX-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]] ; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0 Index: llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -165,15 +165,9 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -247,37 +241,37 @@ ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4 ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4 ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI12]] +; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX13]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX14]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <4 x i32> [[BIN_RDX14]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x i32> [[BIN_RDX15]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX17]], i32 0 +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x i32> [[BIN_RDX12]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX14]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -346,15 +340,9 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP100:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP101:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP102:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP103:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP101:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP102:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP103:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -456,25 +444,25 @@ ; CHECK-NEXT: [[TMP98:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP99:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP71]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP79]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP79]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP100]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP101]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI7]] -; CHECK-NEXT: [[TMP102]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI8]] -; CHECK-NEXT: [[TMP103]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI9]] +; CHECK-NEXT: [[TMP101]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] +; CHECK-NEXT: [[TMP102]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] +; CHECK-NEXT: [[TMP103]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP104:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP104]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP101]], [[TMP100]] -; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX10]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x i32> [[BIN_RDX12]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[BIN_RDX12]], [[RDX_SHUF13]] -; CHECK-NEXT: [[TMP105:%.*]] = extractelement <4 x i32> [[BIN_RDX14]], i32 0 +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP102]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP103]], [[BIN_RDX7]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX8]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[BIN_RDX8]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <4 x i32> [[BIN_RDX9]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX9]], [[RDX_SHUF10]] +; CHECK-NEXT: [[TMP105:%.*]] = extractelement <4 x i32> [[BIN_RDX11]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -536,17 +524,11 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE36:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP180:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP181:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP182:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP183:%.*]], [[PRED_LOAD_CONTINUE33]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -623,194 +605,194 @@ ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP70:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP69]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: +; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] +; CHECK: pred.load.if4: ; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i16, i16* [[TMP72]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP74:%.*]] = bitcast i16* [[TMP73]] to i32* ; CHECK-NEXT: [[TMP75:%.*]] = load i32, i32* [[TMP74]] ; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP70]], i32 [[TMP75]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] +; CHECK: pred.load.continue5: +; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i32> [ [[TMP70]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP76]], [[PRED_LOAD_IF4]] ] ; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: +; CHECK-NEXT: br i1 [[TMP78]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP79:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i16, i16* [[TMP79]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP81:%.*]] = bitcast i16* [[TMP80]] to i32* ; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[TMP81]] ; CHECK-NEXT: [[TMP83:%.*]] = insertelement <4 x i32> [[TMP77]], i32 [[TMP82]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP83]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] +; CHECK: pred.load.continue7: +; CHECK-NEXT: [[TMP84:%.*]] = phi <4 x i32> [ [[TMP77]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP83]], [[PRED_LOAD_IF6]] ] ; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP85]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: +; CHECK-NEXT: br i1 [[TMP85]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; CHECK: pred.load.if8: ; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i16, i16* [[TMP86]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP88:%.*]] = bitcast i16* [[TMP87]] to i32* ; CHECK-NEXT: [[TMP89:%.*]] = load i32, i32* [[TMP88]] ; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP89]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP84]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP90]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] +; CHECK: pred.load.continue9: +; CHECK-NEXT: [[TMP91:%.*]] = phi <4 x i32> [ [[TMP84]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP90]], [[PRED_LOAD_IF8]] ] ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] -; CHECK: pred.load.if13: +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK: pred.load.if10: ; CHECK-NEXT: [[TMP93:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds i16, i16* [[TMP93]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP95:%.*]] = bitcast i16* [[TMP94]] to i32* ; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]] ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> undef, i32 [[TMP96]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP97]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; CHECK: pred.load.continue11: +; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP97]], [[PRED_LOAD_IF10]] ] ; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] -; CHECK: pred.load.if15: +; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK: pred.load.if12: ; CHECK-NEXT: [[TMP100:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP101:%.*]] = getelementptr inbounds i16, i16* [[TMP100]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP102:%.*]] = bitcast i16* [[TMP101]] to i32* ; CHECK-NEXT: [[TMP103:%.*]] = load i32, i32* [[TMP102]] ; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP103]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] -; CHECK: pred.load.continue16: -; CHECK-NEXT: [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP104]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; CHECK: pred.load.continue13: +; CHECK-NEXT: [[TMP105:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP104]], [[PRED_LOAD_IF12]] ] ; CHECK-NEXT: [[TMP106:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP106]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] -; CHECK: pred.load.if17: +; CHECK-NEXT: br i1 [[TMP106]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK: pred.load.if14: ; CHECK-NEXT: [[TMP107:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds i16, i16* [[TMP107]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP109:%.*]] = bitcast i16* [[TMP108]] to i32* ; CHECK-NEXT: [[TMP110:%.*]] = load i32, i32* [[TMP109]] ; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP105]], i32 [[TMP110]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] -; CHECK: pred.load.continue18: -; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP111]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; CHECK: pred.load.continue15: +; CHECK-NEXT: [[TMP112:%.*]] = phi <4 x i32> [ [[TMP105]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP111]], [[PRED_LOAD_IF14]] ] ; CHECK-NEXT: [[TMP113:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] -; CHECK: pred.load.if19: +; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK: pred.load.if16: ; CHECK-NEXT: [[TMP114:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i16, i16* [[TMP114]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP116:%.*]] = bitcast i16* [[TMP115]] to i32* ; CHECK-NEXT: [[TMP117:%.*]] = load i32, i32* [[TMP116]] ; CHECK-NEXT: [[TMP118:%.*]] = insertelement <4 x i32> [[TMP112]], i32 [[TMP117]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] -; CHECK: pred.load.continue20: -; CHECK-NEXT: [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP118]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; CHECK: pred.load.continue17: +; CHECK-NEXT: [[TMP119:%.*]] = phi <4 x i32> [ [[TMP112]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP118]], [[PRED_LOAD_IF16]] ] ; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP120]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] -; CHECK: pred.load.if21: +; CHECK-NEXT: br i1 [[TMP120]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK: pred.load.if18: ; CHECK-NEXT: [[TMP121:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds i16, i16* [[TMP121]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP123:%.*]] = bitcast i16* [[TMP122]] to i32* ; CHECK-NEXT: [[TMP124:%.*]] = load i32, i32* [[TMP123]] ; CHECK-NEXT: [[TMP125:%.*]] = insertelement <4 x i32> undef, i32 [[TMP124]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] -; CHECK: pred.load.continue22: -; CHECK-NEXT: [[TMP126:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP125]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; CHECK: pred.load.continue19: +; CHECK-NEXT: [[TMP126:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP125]], [[PRED_LOAD_IF18]] ] ; CHECK-NEXT: [[TMP127:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] -; CHECK: pred.load.if23: +; CHECK-NEXT: br i1 [[TMP127]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK: pred.load.if20: ; CHECK-NEXT: [[TMP128:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i16, i16* [[TMP128]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP130:%.*]] = bitcast i16* [[TMP129]] to i32* ; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]] ; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP126]], i32 [[TMP131]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] -; CHECK: pred.load.continue24: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP132]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; CHECK: pred.load.continue21: +; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP126]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP132]], [[PRED_LOAD_IF20]] ] ; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] -; CHECK: pred.load.if25: +; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK: pred.load.if22: ; CHECK-NEXT: [[TMP135:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i16, i16* [[TMP135]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16* [[TMP136]] to i32* ; CHECK-NEXT: [[TMP138:%.*]] = load i32, i32* [[TMP137]] ; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP138]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] -; CHECK: pred.load.continue26: -; CHECK-NEXT: [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP139]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] +; CHECK: pred.load.continue23: +; CHECK-NEXT: [[TMP140:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP139]], [[PRED_LOAD_IF22]] ] ; CHECK-NEXT: [[TMP141:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP141]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] -; CHECK: pred.load.if27: +; CHECK-NEXT: br i1 [[TMP141]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK: pred.load.if24: ; CHECK-NEXT: [[TMP142:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP143:%.*]] = getelementptr inbounds i16, i16* [[TMP142]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP144:%.*]] = bitcast i16* [[TMP143]] to i32* ; CHECK-NEXT: [[TMP145:%.*]] = load i32, i32* [[TMP144]] ; CHECK-NEXT: [[TMP146:%.*]] = insertelement <4 x i32> [[TMP140]], i32 [[TMP145]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] -; CHECK: pred.load.continue28: -; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP146]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] +; CHECK: pred.load.continue25: +; CHECK-NEXT: [[TMP147:%.*]] = phi <4 x i32> [ [[TMP140]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP146]], [[PRED_LOAD_IF24]] ] ; CHECK-NEXT: [[TMP148:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] -; CHECK: pred.load.if29: +; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK: pred.load.if26: ; CHECK-NEXT: [[TMP149:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP150:%.*]] = getelementptr inbounds i16, i16* [[TMP149]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP151:%.*]] = bitcast i16* [[TMP150]] to i32* ; CHECK-NEXT: [[TMP152:%.*]] = load i32, i32* [[TMP151]] ; CHECK-NEXT: [[TMP153:%.*]] = insertelement <4 x i32> undef, i32 [[TMP152]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP154:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP153]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] +; CHECK: pred.load.continue27: +; CHECK-NEXT: [[TMP154:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP153]], [[PRED_LOAD_IF26]] ] ; CHECK-NEXT: [[TMP155:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP155]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: +; CHECK-NEXT: br i1 [[TMP155]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK: pred.load.if28: ; CHECK-NEXT: [[TMP156:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP157:%.*]] = getelementptr inbounds i16, i16* [[TMP156]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP158:%.*]] = bitcast i16* [[TMP157]] to i32* ; CHECK-NEXT: [[TMP159:%.*]] = load i32, i32* [[TMP158]] ; CHECK-NEXT: [[TMP160:%.*]] = insertelement <4 x i32> [[TMP154]], i32 [[TMP159]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP160]], [[PRED_LOAD_IF31]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] +; CHECK: pred.load.continue29: +; CHECK-NEXT: [[TMP161:%.*]] = phi <4 x i32> [ [[TMP154]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP160]], [[PRED_LOAD_IF28]] ] ; CHECK-NEXT: [[TMP162:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP162]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: +; CHECK-NEXT: br i1 [[TMP162]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK: pred.load.if30: ; CHECK-NEXT: [[TMP163:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i16, i16* [[TMP163]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP165:%.*]] = bitcast i16* [[TMP164]] to i32* ; CHECK-NEXT: [[TMP166:%.*]] = load i32, i32* [[TMP165]] ; CHECK-NEXT: [[TMP167:%.*]] = insertelement <4 x i32> [[TMP161]], i32 [[TMP166]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: -; CHECK-NEXT: [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP167]], [[PRED_LOAD_IF33]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] +; CHECK: pred.load.continue31: +; CHECK-NEXT: [[TMP168:%.*]] = phi <4 x i32> [ [[TMP161]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP167]], [[PRED_LOAD_IF30]] ] ; CHECK-NEXT: [[TMP169:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP169]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.if35: +; CHECK-NEXT: br i1 [[TMP169]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK: pred.load.if32: ; CHECK-NEXT: [[TMP170:%.*]] = bitcast i32* [[BASE]] to i16* ; CHECK-NEXT: [[TMP171:%.*]] = getelementptr inbounds i16, i16* [[TMP170]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP172:%.*]] = bitcast i16* [[TMP171]] to i32* ; CHECK-NEXT: [[TMP173:%.*]] = load i32, i32* [[TMP172]] ; CHECK-NEXT: [[TMP174:%.*]] = insertelement <4 x i32> [[TMP168]], i32 [[TMP173]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP174]], [[PRED_LOAD_IF35]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] +; CHECK: pred.load.continue33: +; CHECK-NEXT: [[TMP175:%.*]] = phi <4 x i32> [ [[TMP168]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP174]], [[PRED_LOAD_IF32]] ] ; CHECK-NEXT: [[TMP176:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP177:%.*]] = xor <4 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP178:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP179:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP91]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI37:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI38:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI39:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP119]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP147]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP175]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP180]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP181]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI37]] -; CHECK-NEXT: [[TMP182]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI38]] -; CHECK-NEXT: [[TMP183]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI39]] +; CHECK-NEXT: [[TMP181]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] +; CHECK-NEXT: [[TMP182]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] +; CHECK-NEXT: [[TMP183]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP184:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP184]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP181]], [[TMP180]] -; CHECK-NEXT: [[BIN_RDX40:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX40]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX41]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX42:%.*]] = add <4 x i32> [[BIN_RDX41]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF43:%.*]] = shufflevector <4 x i32> [[BIN_RDX42]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[BIN_RDX42]], [[RDX_SHUF43]] -; CHECK-NEXT: [[TMP185:%.*]] = extractelement <4 x i32> [[BIN_RDX44]], i32 0 +; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP182]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP183]], [[BIN_RDX37]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX38]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX39:%.*]] = add <4 x i32> [[BIN_RDX38]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF40:%.*]] = shufflevector <4 x i32> [[BIN_RDX39]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[BIN_RDX39]], [[RDX_SHUF40]] +; CHECK-NEXT: [[TMP185:%.*]] = extractelement <4 x i32> [[BIN_RDX41]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -885,15 +867,9 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP84:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 @@ -967,37 +943,37 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP70]], i32 4, <4 x i1> [[TMP40]], <4 x i32> undef) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 4 ; CHECK-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP72]], i32 4, <4 x i1> [[TMP48]], <4 x i32> undef) ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 8 ; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP74]], i32 4, <4 x i1> [[TMP56]], <4 x i32> undef) ; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP65]], i32 12 ; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP76]], i32 4, <4 x i1> [[TMP64]], <4 x i32> undef) ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP40]], ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP48]], ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP56]], ; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i1> [[TMP64]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP40]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP48]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP56]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP64]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] -; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI12]] +; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP84]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP85:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP85]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP82]], [[TMP81]] -; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX13]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX14]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <4 x i32> [[BIN_RDX14]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x i32> [[BIN_RDX15]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <4 x i32> [[BIN_RDX17]], i32 0 +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP84]], [[BIN_RDX10]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x i32> [[BIN_RDX12]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <4 x i32> [[BIN_RDX14]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1067,16 +1043,10 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1024, [[INDEX]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 2 @@ -1150,37 +1120,37 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI12]] +; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 3072 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX13]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX14]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <4 x i32> [[BIN_RDX14]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x i32> [[BIN_RDX15]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX17]], i32 0 +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x i32> [[BIN_RDX12]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX14]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1298,18 +1268,12 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE36:%.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP148:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP149:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP150:%.*]], [[PRED_LOAD_CONTINUE36]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP151:%.*]], [[PRED_LOAD_CONTINUE36]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP148:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP149:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP150:%.*]], [[PRED_LOAD_CONTINUE33]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP151:%.*]], [[PRED_LOAD_CONTINUE33]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4 @@ -1384,164 +1348,164 @@ ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] -; CHECK: pred.load.if7: +; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] +; CHECK: pred.load.if4: ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP71:%.*]] = load i32, i32* [[TMP70]] ; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] +; CHECK: pred.load.continue5: +; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] ; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] -; CHECK: pred.load.if9: +; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] +; CHECK: pred.load.if6: ; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP76:%.*]] = load i32, i32* [[TMP75]] ; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] -; CHECK: pred.load.continue10: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP77]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] +; CHECK: pred.load.continue7: +; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] -; CHECK: pred.load.if11: +; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] +; CHECK: pred.load.if8: ; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP81:%.*]] = load i32, i32* [[TMP80]] ; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] -; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP82]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] +; CHECK: pred.load.continue9: +; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] ; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] -; CHECK: pred.load.if13: +; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] +; CHECK: pred.load.if10: ; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP86:%.*]] = load i32, i32* [[TMP85]] ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> undef, i32 [[TMP86]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] -; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP87]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] +; CHECK: pred.load.continue11: +; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] ; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] -; CHECK: pred.load.if15: +; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] +; CHECK: pred.load.if12: ; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP91:%.*]] = load i32, i32* [[TMP90]] ; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] -; CHECK: pred.load.continue16: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP92]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] +; CHECK: pred.load.continue13: +; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], [ [[TMP92]], [[PRED_LOAD_IF12]] ] ; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] -; CHECK: pred.load.if17: +; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK: pred.load.if14: ; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP96:%.*]] = load i32, i32* [[TMP95]] ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] -; CHECK: pred.load.continue18: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP97]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; CHECK: pred.load.continue15: +; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] ; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] -; CHECK: pred.load.if19: +; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK: pred.load.if16: ; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP101:%.*]] = load i32, i32* [[TMP100]] ; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] -; CHECK: pred.load.continue20: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP102]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; CHECK: pred.load.continue17: +; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] ; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] -; CHECK: pred.load.if21: +; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK: pred.load.if18: ; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP106:%.*]] = load i32, i32* [[TMP105]] ; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> undef, i32 [[TMP106]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] -; CHECK: pred.load.continue22: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE20]] ], [ [[TMP107]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; CHECK: pred.load.continue19: +; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] ; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] -; CHECK: pred.load.if23: +; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] +; CHECK: pred.load.if20: ; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP111:%.*]] = load i32, i32* [[TMP110]] ; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] -; CHECK: pred.load.continue24: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP112]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] +; CHECK: pred.load.continue21: +; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] ; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] -; CHECK: pred.load.if25: +; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] +; CHECK: pred.load.if22: ; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP116:%.*]] = load i32, i32* [[TMP115]] ; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] -; CHECK: pred.load.continue26: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP117]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] +; CHECK: pred.load.continue23: +; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] ; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] -; CHECK: pred.load.if27: +; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] +; CHECK: pred.load.if24: ; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP121:%.*]] = load i32, i32* [[TMP120]] ; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] -; CHECK: pred.load.continue28: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP122]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] +; CHECK: pred.load.continue25: +; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] ; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] -; CHECK: pred.load.if29: +; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] +; CHECK: pred.load.if26: ; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP126:%.*]] = load i32, i32* [[TMP125]] ; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> undef, i32 [[TMP126]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE28]] ], [ [[TMP127]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] +; CHECK: pred.load.continue27: +; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] ; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: +; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] +; CHECK: pred.load.if28: ; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP131:%.*]] = load i32, i32* [[TMP130]] ; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP132]], [[PRED_LOAD_IF31]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] +; CHECK: pred.load.continue29: +; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] ; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: +; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK: pred.load.if30: ; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP136:%.*]] = load i32, i32* [[TMP135]] ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP137]], [[PRED_LOAD_IF33]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] +; CHECK: pred.load.continue31: +; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] ; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.if35: +; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] +; CHECK: pred.load.if32: ; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP141:%.*]] = load i32, i32* [[TMP140]] ; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP142]], [[PRED_LOAD_IF35]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] +; CHECK: pred.load.continue33: +; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] ; CHECK-NEXT: [[TMP144:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP145:%.*]] = xor <4 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP146:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP147:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI37:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI38:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI39:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP148]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP149]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI37]] -; CHECK-NEXT: [[TMP150]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI38]] -; CHECK-NEXT: [[TMP151]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI39]] +; CHECK-NEXT: [[TMP149]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] +; CHECK-NEXT: [[TMP150]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI35]] +; CHECK-NEXT: [[TMP151]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP152:%.*]] = icmp eq i64 [[INDEX_NEXT]], 2048 ; CHECK-NEXT: br i1 [[TMP152]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP149]], [[TMP148]] -; CHECK-NEXT: [[BIN_RDX40:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX40]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX41]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX42:%.*]] = add <4 x i32> [[BIN_RDX41]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF43:%.*]] = shufflevector <4 x i32> [[BIN_RDX42]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX44:%.*]] = add <4 x i32> [[BIN_RDX42]], [[RDX_SHUF43]] -; CHECK-NEXT: [[TMP153:%.*]] = extractelement <4 x i32> [[BIN_RDX44]], i32 0 +; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP150]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP151]], [[BIN_RDX37]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX38]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX39:%.*]] = add <4 x i32> [[BIN_RDX38]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF40:%.*]] = shufflevector <4 x i32> [[BIN_RDX39]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <4 x i32> [[BIN_RDX39]], [[RDX_SHUF40]] +; CHECK-NEXT: [[TMP153:%.*]] = extractelement <4 x i32> [[BIN_RDX41]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1606,15 +1570,9 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1688,37 +1646,37 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI12]] +; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX13]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX14]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <4 x i32> [[BIN_RDX14]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x i32> [[BIN_RDX15]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX17]], i32 0 +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x i32> [[BIN_RDX12]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX14]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1783,15 +1741,9 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -1865,37 +1817,37 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI12]] +; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX13]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX14]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <4 x i32> [[BIN_RDX14]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x i32> [[BIN_RDX15]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX17]], i32 0 +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x i32> [[BIN_RDX12]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX14]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1960,15 +1912,9 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP80:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP81:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP82:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP83:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -2042,37 +1988,37 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef) ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef) ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef) ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI8:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD5]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI9:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]] -; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]] -; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI6]], [[PREDPHI12]] +; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI7]] +; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI8]] +; CHECK-NEXT: [[TMP83]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI9]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; CHECK-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20 ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP81]], [[TMP80]] -; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX13]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX14]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX15:%.*]] = add <4 x i32> [[BIN_RDX14]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF16:%.*]] = shufflevector <4 x i32> [[BIN_RDX15]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX17:%.*]] = add <4 x i32> [[BIN_RDX15]], [[RDX_SHUF16]] -; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX17]], i32 0 +; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP82]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP83]], [[BIN_RDX10]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <4 x i32> [[BIN_RDX11]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x i32> [[BIN_RDX12]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[BIN_RDX12]], [[RDX_SHUF13]] +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <4 x i32> [[BIN_RDX14]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: Index: llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -44,9 +44,6 @@ ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX1-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 @@ -117,12 +114,6 @@ ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION12:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION13:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION14:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 @@ -136,17 +127,17 @@ ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !0 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !0 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !0 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !0 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD15]], -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD16]], -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD17]], +; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] @@ -156,17 +147,17 @@ ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> undef), !alias.scope !3 ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> undef), !alias.scope !3 ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> undef), !alias.scope !3 ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> undef), !alias.scope !3 ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD18]], [[WIDE_LOAD15]] -; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD19]], [[WIDE_LOAD16]] -; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD20]], [[WIDE_LOAD17]] +; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] +; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] +; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] ; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] @@ -238,12 +229,6 @@ ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> undef, i64 [[INDEX]], i32 0 -; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> undef, <16 x i32> zeroinitializer -; AVX512-NEXT: [[INDUCTION:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION12:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION13:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION14:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 @@ -257,17 +242,17 @@ ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !0 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !0 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !0 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !0 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !0 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD15]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD16]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD17]], +; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]] @@ -277,17 +262,17 @@ ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> undef), !alias.scope !3 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> undef), !alias.scope !3 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> undef), !alias.scope !3 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> undef), !alias.scope !3 ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD18]], [[WIDE_LOAD15]] -; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD19]], [[WIDE_LOAD16]] -; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD20]], [[WIDE_LOAD17]] +; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] +; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]] @@ -389,9 +374,6 @@ ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX1-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i32 0 @@ -462,12 +444,6 @@ ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION12:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION13:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION14:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 @@ -481,17 +457,17 @@ ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD15]], -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD16]], -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD17]], +; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] @@ -501,17 +477,17 @@ ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> undef), !alias.scope !14 ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> undef), !alias.scope !14 ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> undef), !alias.scope !14 ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> undef), !alias.scope !14 ; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD18]], [[WIDE_LOAD15]] -; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD19]], [[WIDE_LOAD16]] -; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD20]], [[WIDE_LOAD17]] +; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] +; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] +; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] ; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] @@ -583,12 +559,6 @@ ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> undef, i64 [[INDEX]], i32 0 -; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> undef, <16 x i32> zeroinitializer -; AVX512-NEXT: [[INDUCTION:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION12:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION13:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION14:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 @@ -602,17 +572,17 @@ ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD15]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD16]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD17]], +; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] @@ -622,17 +592,17 @@ ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> undef), !alias.scope !14 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> undef), !alias.scope !14 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> undef), !alias.scope !14 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> undef), !alias.scope !14 ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD18]], [[WIDE_LOAD15]] -; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD19]], [[WIDE_LOAD16]] -; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD20]], [[WIDE_LOAD17]] +; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] +; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] @@ -743,9 +713,6 @@ ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX1-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 @@ -818,12 +785,6 @@ ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION12:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION13:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION14:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 @@ -837,17 +798,17 @@ ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !21 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !21 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !21 +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !21 ; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD15]], -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD16]], -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD17]], +; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] @@ -857,21 +818,21 @@ ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> undef), !alias.scope !24 ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 8 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> undef), !alias.scope !24 ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 16 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> undef), !alias.scope !24 ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 24 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> undef), !alias.scope !24 ; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD15]] to <8 x float> -; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD16]] to <8 x float> -; AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD17]] to <8 x float> +; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> +; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> +; AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x float> ; AVX2-NEXT: [[TMP36:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX2-NEXT: [[TMP37:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD18]], [[TMP33]] -; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD19]], [[TMP34]] -; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD20]], [[TMP35]] +; AVX2-NEXT: [[TMP37:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] +; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] +; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] ; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] @@ -944,12 +905,6 @@ ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> undef, i64 [[INDEX]], i32 0 -; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> undef, <16 x i32> zeroinitializer -; AVX512-NEXT: [[INDUCTION:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION12:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION13:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION14:%.*]] = add <16 x i64> [[BROADCAST_SPLAT]], ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 @@ -963,17 +918,17 @@ ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD15]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD16]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD17]], +; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]] @@ -983,21 +938,21 @@ ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> -; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD15]] to <16 x float> -; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD16]] to <16 x float> -; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD17]] to <16 x float> +; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> +; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD13]] to <16 x float> +; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD14]] to <16 x float> ; AVX512-NEXT: [[TMP36:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD18]], [[TMP33]] -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD19]], [[TMP34]] -; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD20]], [[TMP35]] +; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] +; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]] @@ -1110,12 +1065,6 @@ ; AVX-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX: vector.body: ; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX-NEXT: [[INDUCTION12:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX-NEXT: [[INDUCTION13:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX-NEXT: [[INDUCTION14:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; AVX-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; AVX-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 @@ -1129,17 +1078,17 @@ ; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 ; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4 ; AVX-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !31 ; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !31 ; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12 ; AVX-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>* -; AVX-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !31 +; AVX-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !alias.scope !31 ; AVX-NEXT: [[TMP16:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD15]], -; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD16]], -; AVX-NEXT: [[TMP19:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD17]], +; AVX-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP18:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD13]], +; AVX-NEXT: [[TMP19:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD14]], ; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]] ; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]] ; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]] @@ -1149,21 +1098,21 @@ ; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP25]], i32 8, <4 x i1> [[TMP16]], <4 x double> undef), !alias.scope !34 ; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 4 ; AVX-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> undef), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP27]], i32 8, <4 x i1> [[TMP17]], <4 x double> undef), !alias.scope !34 ; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8 ; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> undef), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP18]], <4 x double> undef), !alias.scope !34 ; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 12 ; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> undef), !alias.scope !34 +; AVX-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP19]], <4 x double> undef), !alias.scope !34 ; AVX-NEXT: [[TMP32:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> -; AVX-NEXT: [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD15]] to <4 x double> -; AVX-NEXT: [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD16]] to <4 x double> -; AVX-NEXT: [[TMP35:%.*]] = sitofp <4 x i32> [[WIDE_LOAD17]] to <4 x double> +; AVX-NEXT: [[TMP33:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x double> +; AVX-NEXT: [[TMP34:%.*]] = sitofp <4 x i32> [[WIDE_LOAD13]] to <4 x double> +; AVX-NEXT: [[TMP35:%.*]] = sitofp <4 x i32> [[WIDE_LOAD14]] to <4 x double> ; AVX-NEXT: [[TMP36:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX-NEXT: [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD18]], [[TMP33]] -; AVX-NEXT: [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD19]], [[TMP34]] -; AVX-NEXT: [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD20]], [[TMP35]] +; AVX-NEXT: [[TMP37:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]] +; AVX-NEXT: [[TMP38:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]] +; AVX-NEXT: [[TMP39:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]] ; AVX-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] ; AVX-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] ; AVX-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]] @@ -1236,12 +1185,6 @@ ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX512-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION12:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION13:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION14:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 @@ -1255,17 +1198,17 @@ ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !31 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !31 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD15]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD16]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD17]], +; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]] @@ -1275,21 +1218,21 @@ ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> undef), !alias.scope !34 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !34 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD19:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> undef), !alias.scope !34 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 24 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD20:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> undef), !alias.scope !34 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> -; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD15]] to <8 x double> -; AVX512-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD16]] to <8 x double> -; AVX512-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD17]] to <8 x double> +; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double> +; AVX512-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> +; AVX512-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x double> ; AVX512-NEXT: [[TMP36:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX512-NEXT: [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD18]], [[TMP33]] -; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD19]], [[TMP34]] -; AVX512-NEXT: [[TMP39:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD20]], [[TMP35]] +; AVX512-NEXT: [[TMP37:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD15]], [[TMP33]] +; AVX512-NEXT: [[TMP38:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD16]], [[TMP34]] +; AVX512-NEXT: [[TMP39:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD17]], [[TMP35]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]] @@ -1631,12 +1574,6 @@ ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION12:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION13:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION14:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -4 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -8 @@ -1653,78 +1590,78 @@ ; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -4 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -3 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD15]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD12]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -3 ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD17]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD14]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -12 ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -3 ; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* -; AVX2-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !41 -; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD19]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD16]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer -; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE16]], zeroinitializer -; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE18]], zeroinitializer -; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE20]], zeroinitializer +; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt <4 x i32> [[REVERSE13]], zeroinitializer +; AVX2-NEXT: [[TMP22:%.*]] = icmp sgt <4 x i32> [[REVERSE15]], zeroinitializer +; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt <4 x i32> [[REVERSE17]], zeroinitializer ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP3]] ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 0 ; AVX2-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -3 -; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[REVERSE18:%.*]] = shufflevector <4 x i1> [[TMP20]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE21]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP30]], i32 8, <4 x i1> [[REVERSE18]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> undef, <4 x i32> ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -4 ; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -3 -; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP21]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP33]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> undef, <4 x i32> ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8 ; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -3 -; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i1> [[TMP22]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE23]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD24]], <4 x double> undef, <4 x i32> ; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -12 ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -3 -; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP23]], <4 x i1> undef, <4 x i32> ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE29]], <4 x double> undef), !alias.scope !44 -; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD30]], <4 x double> undef, <4 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE22]], -; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE25]], -; AVX2-NEXT: [[TMP42:%.*]] = fadd <4 x double> [[REVERSE28]], -; AVX2-NEXT: [[TMP43:%.*]] = fadd <4 x double> [[REVERSE31]], +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD27]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = fadd <4 x double> [[REVERSE19]], +; AVX2-NEXT: [[TMP41:%.*]] = fadd <4 x double> [[REVERSE22]], +; AVX2-NEXT: [[TMP42:%.*]] = fadd <4 x double> [[REVERSE25]], +; AVX2-NEXT: [[TMP43:%.*]] = fadd <4 x double> [[REVERSE28]], ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX2-NEXT: [[REVERSE32:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: [[REVERSE29:%.*]] = shufflevector <4 x double> [[TMP40]], <4 x double> undef, <4 x i32> ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX2-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 -3 ; AVX2-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE32]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE21]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE29]], <4 x double>* [[TMP50]], i32 8, <4 x i1> [[REVERSE18]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x double> [[TMP41]], <4 x double> undef, <4 x i32> ; AVX2-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -4 ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -3 ; AVX2-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE34]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[REVERSE36:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE31]], <4 x double>* [[TMP53]], i32 8, <4 x i1> [[REVERSE20]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[REVERSE33:%.*]] = shufflevector <4 x double> [[TMP42]], <4 x double> undef, <4 x i32> ; AVX2-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8 ; AVX2-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -3 ; AVX2-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE36]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 -; AVX2-NEXT: [[REVERSE38:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> undef, <4 x i32> +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE33]], <4 x double>* [[TMP56]], i32 8, <4 x i1> [[REVERSE23]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[REVERSE35:%.*]] = shufflevector <4 x double> [[TMP43]], <4 x double> undef, <4 x i32> ; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -12 ; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -3 ; AVX2-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <4 x double>* -; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE38]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE29]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 @@ -1781,12 +1718,6 @@ ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] -; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX512-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION12:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION13:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION14:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -16 @@ -1803,78 +1734,78 @@ ; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !51 -; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD15]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !51 +; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> undef, <8 x i32> ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7 ; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !51 -; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD17]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !51 +; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> undef, <8 x i32> ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !51 -; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD19]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !51 +; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> undef, <8 x i32> ; AVX512-NEXT: [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE16]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE18]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE20]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <8 x i32> [[REVERSE15]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp sgt <8 x i32> [[REVERSE17]], zeroinitializer ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 0 ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -7 -; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE21]], <8 x double> undef), !alias.scope !54 -; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> undef), !alias.scope !54 +; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -7 -; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> undef), !alias.scope !54 -; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> undef), !alias.scope !54 +; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -16 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -7 -; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !54 -; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> undef), !alias.scope !54 +; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -24 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -7 -; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD30:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE29]], <8 x double> undef), !alias.scope !54 -; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD30]], <8 x double> undef, <8 x i32> -; AVX512-NEXT: [[TMP40:%.*]] = fadd <8 x double> [[REVERSE22]], -; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE25]], -; AVX512-NEXT: [[TMP42:%.*]] = fadd <8 x double> [[REVERSE28]], -; AVX512-NEXT: [[TMP43:%.*]] = fadd <8 x double> [[REVERSE31]], +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !54 +; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], +; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], +; AVX512-NEXT: [[TMP42:%.*]] = fadd <8 x double> [[REVERSE25]], +; AVX512-NEXT: [[TMP43:%.*]] = fadd <8 x double> [[REVERSE28]], ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP3]] -; AVX512-NEXT: [[REVERSE32:%.*]] = shufflevector <8 x double> [[TMP40]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: [[REVERSE29:%.*]] = shufflevector <8 x double> [[TMP40]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 -7 ; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE32]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE21]]), !alias.scope !56, !noalias !58 -; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -7 ; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE34]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !56, !noalias !58 -; AVX512-NEXT: [[REVERSE36:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -16 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -7 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE36]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !56, !noalias !58 -; AVX512-NEXT: [[REVERSE38:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> undef, <8 x i32> +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -24 ; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -7 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE38]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE29]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !56, !noalias !58 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 ; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !59 @@ -1954,12 +1885,6 @@ ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 @@ -1973,17 +1898,17 @@ ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 +; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 ; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 ; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 +; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 ; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 +; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 ; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], +; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], ; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer ; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer ; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer @@ -2001,17 +1926,17 @@ ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> undef) ; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4 ; AVX1-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) +; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) ; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8 ; AVX1-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) ; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12 ; AVX1-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD9]], zeroinitializer +; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer ; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] @@ -2084,12 +2009,6 @@ ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 @@ -2103,17 +2022,17 @@ ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 +; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 +; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 +; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 ; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], +; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], ; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer ; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer ; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer @@ -2131,17 +2050,17 @@ ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x double*> undef) ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 4 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) +; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x double*> undef) ; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x double*> undef) ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 12 ; AVX2-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <4 x double*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x double*> undef) ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD9]], zeroinitializer +; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] @@ -2214,12 +2133,6 @@ ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX512-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION1:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION2:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 @@ -2233,17 +2146,17 @@ ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1 +; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1 +; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1 +; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1 ; AVX512-NEXT: [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD4]], -; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD5]], -; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], +; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], +; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], ; AVX512-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer ; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer @@ -2261,17 +2174,17 @@ ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x double*> undef) ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double** [[TMP34]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x double*> undef) +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x double*> undef) ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 16 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast double** [[TMP36]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x double*> undef) +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x double*> undef) ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double*, double** [[TMP24]], i32 24 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double** [[TMP38]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x double*> undef) +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x double*> undef) ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD9]], zeroinitializer +; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x double*> [[WIDE_MASKED_LOAD6]], zeroinitializer ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] @@ -2389,12 +2302,6 @@ ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX1-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX1-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 @@ -2408,17 +2315,17 @@ ; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 ; AVX1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 ; AVX1-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 +; AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 ; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 ; AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 +; AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 ; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 ; AVX1-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 +; AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 ; AVX1-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], +; AVX1-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX1-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX1-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], ; AVX1-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer ; AVX1-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer ; AVX1-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer @@ -2436,17 +2343,17 @@ ; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> undef) ; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4 ; AVX1-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) +; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) ; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8 ; AVX1-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) +; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) ; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12 ; AVX1-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) +; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) ; AVX1-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD9]], zeroinitializer +; AVX1-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX1-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX1-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer ; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX1-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] @@ -2519,12 +2426,6 @@ ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; AVX2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION1:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION2:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; AVX2-NEXT: [[INDUCTION3:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 ; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8 @@ -2538,17 +2439,17 @@ ; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 ; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 4 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 +; AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP11]], align 1 ; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 +; AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1 ; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 12 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* -; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 +; AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1 ; AVX2-NEXT: [[TMP16:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD4]], -; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD5]], -; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD6]], +; AVX2-NEXT: [[TMP17:%.*]] = and <4 x i8> [[WIDE_LOAD1]], +; AVX2-NEXT: [[TMP18:%.*]] = and <4 x i8> [[WIDE_LOAD2]], +; AVX2-NEXT: [[TMP19:%.*]] = and <4 x i8> [[WIDE_LOAD3]], ; AVX2-NEXT: [[TMP20:%.*]] = icmp eq <4 x i8> [[TMP16]], zeroinitializer ; AVX2-NEXT: [[TMP21:%.*]] = icmp eq <4 x i8> [[TMP17]], zeroinitializer ; AVX2-NEXT: [[TMP22:%.*]] = icmp eq <4 x i8> [[TMP18]], zeroinitializer @@ -2566,17 +2467,17 @@ ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP33]], i32 8, <4 x i1> [[TMP28]], <4 x i32 ()*> undef) ; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 4 ; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) +; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP35]], i32 8, <4 x i1> [[TMP29]], <4 x i32 ()*> undef) ; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8 ; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) +; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP37]], i32 8, <4 x i1> [[TMP30]], <4 x i32 ()*> undef) ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 12 ; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <4 x i32 ()*>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) +; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP39]], i32 8, <4 x i1> [[TMP31]], <4 x i32 ()*> undef) ; AVX2-NEXT: [[TMP40:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD9]], zeroinitializer +; AVX2-NEXT: [[TMP41:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX2-NEXT: [[TMP42:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX2-NEXT: [[TMP43:%.*]] = icmp eq <4 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer ; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] @@ -2649,12 +2550,6 @@ ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; AVX512-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; AVX512-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION1:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION2:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; AVX512-NEXT: [[INDUCTION3:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 @@ -2668,17 +2563,17 @@ ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 1 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 8 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1 +; AVX512-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 1 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1 +; AVX512-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 1 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 24 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>* -; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1 +; AVX512-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 1 ; AVX512-NEXT: [[TMP16:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD4]], -; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD5]], -; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP17:%.*]] = and <8 x i8> [[WIDE_LOAD1]], +; AVX512-NEXT: [[TMP18:%.*]] = and <8 x i8> [[WIDE_LOAD2]], +; AVX512-NEXT: [[TMP19:%.*]] = and <8 x i8> [[WIDE_LOAD3]], ; AVX512-NEXT: [[TMP20:%.*]] = icmp eq <8 x i8> [[TMP16]], zeroinitializer ; AVX512-NEXT: [[TMP21:%.*]] = icmp eq <8 x i8> [[TMP17]], zeroinitializer ; AVX512-NEXT: [[TMP22:%.*]] = icmp eq <8 x i8> [[TMP18]], zeroinitializer @@ -2696,17 +2591,17 @@ ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP33]], i32 8, <8 x i1> [[TMP28]], <8 x i32 ()*> undef) ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 ()** [[TMP34]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP35]], i32 8, <8 x i1> [[TMP29]], <8 x i32 ()*> undef) ; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 16 ; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 ()** [[TMP36]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP37]], i32 8, <8 x i1> [[TMP30]], <8 x i32 ()*> undef) ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP24]], i32 24 ; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 ()** [[TMP38]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP39]], i32 8, <8 x i1> [[TMP31]], <8 x i32 ()*> undef) ; AVX512-NEXT: [[TMP40:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD7]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD8]], zeroinitializer -; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD9]], zeroinitializer +; AVX512-NEXT: [[TMP41:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP43:%.*]] = icmp eq <8 x i32 ()*> [[WIDE_MASKED_LOAD6]], zeroinitializer ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP1]] ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[TMP2]] Index: llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -19,115 +19,115 @@ define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { ; O1-LABEL: @enabled( ; O1-NEXT: entry: -; O1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O1-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O1-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O1-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O1-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O1-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O1-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O1-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O1-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O1-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O1-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O1-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O1-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O1-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O1-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O1-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O1-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O1-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O1-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O1-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O1-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O1-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O1-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O1-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O1-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O1-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O1-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O1-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O1-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O1-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O1-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O1-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O1-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O1-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O1-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -136,115 +136,115 @@ ; ; O2-LABEL: @enabled( ; O2-NEXT: entry: -; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -253,115 +253,115 @@ ; ; O3-LABEL: @enabled( ; O3-NEXT: entry: -; O3-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O3-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -370,115 +370,115 @@ ; ; O3DEFAULT-LABEL: @enabled( ; O3DEFAULT-NEXT: entry: -; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -487,115 +487,115 @@ ; ; Os-LABEL: @enabled( ; Os-NEXT: entry: -; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; Os-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; Os-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -604,115 +604,115 @@ ; ; Oz-LABEL: @enabled( ; Oz-NEXT: entry: -; Oz-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; Oz-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; Oz-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; Oz-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; Oz-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; Oz-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; Oz-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; Oz-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; Oz-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; Oz-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; Oz-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; Oz-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; Oz-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; Oz-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; Oz-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; Oz-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; Oz-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; Oz-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; Oz-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; Oz-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; Oz-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; Oz-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; Oz-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; Oz-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; Oz-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; Oz-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; Oz-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; Oz-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; Oz-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; Oz-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; Oz-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; Oz-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; Oz-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; Oz-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; Oz-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; Oz-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; Oz-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; Oz-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; Oz-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; Oz-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; Oz-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; Oz-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; Oz-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; Oz-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; Oz-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; Oz-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; Oz-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; Oz-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; Oz-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; Oz-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; Oz-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; Oz-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; Oz-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; Oz-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; Oz-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; Oz-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; Oz-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; Oz-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; Oz-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; Oz-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; Oz-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -721,115 +721,115 @@ ; ; O1VEC2-LABEL: @enabled( ; O1VEC2-NEXT: entry: -; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O1VEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O1VEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O1VEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O1VEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O1VEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O1VEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O1VEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O1VEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O1VEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O1VEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O1VEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O1VEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O1VEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O1VEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O1VEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O1VEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O1VEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O1VEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O1VEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O1VEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O1VEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O1VEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O1VEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O1VEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O1VEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O1VEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O1VEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O1VEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O1VEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O1VEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O1VEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O1VEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O1VEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O1VEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O1VEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O1VEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O1VEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O1VEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O1VEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O1VEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O1VEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O1VEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O1VEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O1VEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O1VEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O1VEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O1VEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O1VEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O1VEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O1VEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O1VEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O1VEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O1VEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O1VEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O1VEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -838,115 +838,115 @@ ; ; OzVEC2-LABEL: @enabled( ; OzVEC2-NEXT: entry: -; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; OzVEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; OzVEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; OzVEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; OzVEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; OzVEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; OzVEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; OzVEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; OzVEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; OzVEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; OzVEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; OzVEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; OzVEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; OzVEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; OzVEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; OzVEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; OzVEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; OzVEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; OzVEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; OzVEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; OzVEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; OzVEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; OzVEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; OzVEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; OzVEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; OzVEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; OzVEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; OzVEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; OzVEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; OzVEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; OzVEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; OzVEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; OzVEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; OzVEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; OzVEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; OzVEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; OzVEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; OzVEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; OzVEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; OzVEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; OzVEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; OzVEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; OzVEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; OzVEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; OzVEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; OzVEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; OzVEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; OzVEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; OzVEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; OzVEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; OzVEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; OzVEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; OzVEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; OzVEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; OzVEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; OzVEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -955,115 +955,115 @@ ; ; O3DIS-LABEL: @enabled( ; O3DIS-NEXT: entry: -; O3DIS-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O3DIS-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3DIS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3DIS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3DIS-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3DIS-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O3DIS-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O3DIS-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3DIS-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O3DIS-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O3DIS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O3DIS-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3DIS-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O3DIS-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O3DIS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O3DIS-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3DIS-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O3DIS-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O3DIS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O3DIS-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3DIS-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O3DIS-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O3DIS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O3DIS-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3DIS-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O3DIS-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O3DIS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O3DIS-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3DIS-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O3DIS-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O3DIS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O3DIS-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3DIS-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O3DIS-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O3DIS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O3DIS-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3DIS-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O3DIS-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O3DIS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O3DIS-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O3DIS-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O3DIS-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O3DIS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O3DIS-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3DIS-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O3DIS-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O3DIS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O3DIS-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3DIS-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O3DIS-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O3DIS-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O3DIS-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3DIS-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O3DIS-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O3DIS-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O3DIS-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3DIS-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O3DIS-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O3DIS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O3DIS-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3DIS-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O3DIS-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O3DIS-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O3DIS-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O3DIS-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3DIS-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O3DIS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O3DIS-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O3DIS-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -1109,115 +1109,115 @@ ; ; O2-LABEL: @nopragma( ; O2-NEXT: entry: -; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -1226,115 +1226,115 @@ ; ; O3-LABEL: @nopragma( ; O3-NEXT: entry: -; O3-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O3-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -1343,115 +1343,115 @@ ; ; O3DEFAULT-LABEL: @nopragma( ; O3DEFAULT-NEXT: entry: -; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -1460,115 +1460,115 @@ ; ; Os-LABEL: @nopragma( ; Os-NEXT: entry: -; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; Os-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; Os-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 ; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 ; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 ; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 ; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 -; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 ; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 ; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 ; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 -; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 ; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 ; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 -; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 ; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 ; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 ; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 ; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 ; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 ; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 -; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 ; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 ; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 ; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 -; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 ; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 ; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 ; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 -; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 ; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 ; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 ; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 -; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 ; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 ; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 ; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 -; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 ; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 ; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 ; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 -; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 ; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 ; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 ; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 -; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 ; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 ; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 ; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 -; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 ; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 ; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 ; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 -; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 ; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 ; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 ; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* ; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 -; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] ; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 ; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* ; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 @@ -1596,20 +1596,17 @@ ; O1VEC2-NEXT: entry: ; O1VEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; O1VEC2: vector.ph: -; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; O1VEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O1VEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; O1VEC2-NEXT: br label [[VECTOR_BODY:%.*]] ; O1VEC2: vector.body: ; O1VEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; O1VEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; O1VEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; O1VEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] ; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; O1VEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; O1VEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O1VEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] ; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 ; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* @@ -1641,20 +1638,17 @@ ; OzVEC2-NEXT: entry: ; OzVEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; OzVEC2: vector.ph: -; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 -; OzVEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; OzVEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer ; OzVEC2-NEXT: br label [[VECTOR_BODY:%.*]] ; OzVEC2: vector.body: ; OzVEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; OzVEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; OzVEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; OzVEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] ; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 ; OzVEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* ; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; OzVEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; OzVEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] ; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 ; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* Index: llvm/test/Transforms/LoopVectorize/X86/optsize.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -17,12 +17,11 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef) @@ -52,6 +51,47 @@ ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; +; AUTOVF-LABEL: @foo_optsize( +; AUTOVF-NEXT: entry: +; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AUTOVF: vector.ph: +; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]] +; AUTOVF: vector.body: +; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> undef, i32 [[TMP0]], i32 0 +; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> undef, <32 x i32> zeroinitializer +; AUTOVF-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; AUTOVF-NEXT: [[TMP2:%.*]] = icmp ule <32 x i32> [[BROADCAST_SPLAT]], +; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* +; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef) +; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; AUTOVF-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0 +; AUTOVF-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8> +; AUTOVF-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* +; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]]) +; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 +; AUTOVF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 +; AUTOVF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; AUTOVF: middle.block: +; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AUTOVF: scalar.ph: +; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] +; AUTOVF: for.body: +; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; AUTOVF-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 +; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 +; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2 +; AUTOVF: for.end: +; AUTOVF-NEXT: ret i32 0 +; entry: br label %for.body @@ -81,12 +121,11 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef) @@ -116,6 +155,47 @@ ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; +; AUTOVF-LABEL: @foo_minsize( +; AUTOVF-NEXT: entry: +; AUTOVF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AUTOVF: vector.ph: +; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]] +; AUTOVF: vector.body: +; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> undef, i32 [[TMP0]], i32 0 +; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> undef, <32 x i32> zeroinitializer +; AUTOVF-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; AUTOVF-NEXT: [[TMP2:%.*]] = icmp ule <32 x i32> [[BROADCAST_SPLAT]], +; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* +; AUTOVF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* [[TMP4]], i32 1, <32 x i1> [[TMP2]], <32 x i8> undef) +; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq <32 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; AUTOVF-NEXT: [[TMP6:%.*]] = extractelement <32 x i1> [[TMP5]], i32 0 +; AUTOVF-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> , <32 x i8> +; AUTOVF-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>* +; AUTOVF-NEXT: call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> [[TMP7]], <32 x i8>* [[TMP8]], i32 1, <32 x i1> [[TMP2]]) +; AUTOVF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 32 +; AUTOVF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 +; AUTOVF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; AUTOVF: middle.block: +; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AUTOVF: scalar.ph: +; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] +; AUTOVF: for.body: +; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; AUTOVF-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 +; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 +; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5 +; AUTOVF: for.end: +; AUTOVF-NEXT: ret i32 0 +; entry: br label %for.body @@ -140,15 +220,39 @@ ; We can't vectorize this one because we version for stride==1; even having TC ; a multiple of VF. -; CHECK-LABEL: @scev4stride1 -; CHECK-NOT: vector.scevcheck -; CHECK-NOT: vector.body: -; CHECK-LABEL: for.body: -; AUTOVF-LABEL: @scev4stride1 -; AUTOVF-NOT: vector.scevcheck -; AUTOVF-NOT: vector.body: -; AUTOVF-LABEL: for.body: define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { +; CHECK-LABEL: @scev4stride1( +; CHECK-NEXT: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]] +; CHECK-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: ret void +; +; AUTOVF-LABEL: @scev4stride1( +; AUTOVF-NEXT: for.body.preheader: +; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] +; AUTOVF: for.body: +; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] +; AUTOVF-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]] +; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]] +; AUTOVF-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AUTOVF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]] +; AUTOVF-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4 +; AUTOVF-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 +; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 +; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; AUTOVF: for.end.loopexit: +; AUTOVF-NEXT: ret void +; for.body.preheader: br label %for.body @@ -174,15 +278,31 @@ ; We can't vectorize this one because we version for overflow check and tiny ; trip count leads to opt-for-size (which otherwise could fold the tail by ; masking). -; CHECK-LABEL: @main -; CHECK-NOT: vector.scevcheck -; CHECK-NOT: vector.body: -; CHECK-LABEL: for.cond: -; AUTOVF-LABEL: @main -; AUTOVF-NOT: vector.scevcheck -; AUTOVF-NOT: vector.body: -; AUTOVF-LABEL: for.cond: define i32 @main() local_unnamed_addr { +; CHECK-LABEL: @main( +; CHECK-NEXT: while.cond: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[D_0:%.*]] = phi i32 [ 0, [[WHILE_COND:%.*]] ], [ [[ADD:%.*]], [[FOR_COND]] ] +; CHECK-NEXT: [[CONV:%.*]] = and i32 [[D_0]], 65535 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], 4 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[CONV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[WHILE_COND_LOOPEXIT:%.*]] +; CHECK: while.cond.loopexit: +; CHECK-NEXT: ret i32 0 +; +; AUTOVF-LABEL: @main( +; AUTOVF-NEXT: while.cond: +; AUTOVF-NEXT: br label [[FOR_COND:%.*]] +; AUTOVF: for.cond: +; AUTOVF-NEXT: [[D_0:%.*]] = phi i32 [ 0, [[WHILE_COND:%.*]] ], [ [[ADD:%.*]], [[FOR_COND]] ] +; AUTOVF-NEXT: [[CONV:%.*]] = and i32 [[D_0]], 65535 +; AUTOVF-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], 4 +; AUTOVF-NEXT: [[ADD]] = add nuw nsw i32 [[CONV]], 1 +; AUTOVF-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[WHILE_COND_LOOPEXIT:%.*]] +; AUTOVF: while.cond.loopexit: +; AUTOVF-NEXT: ret i32 0 +; while.cond: br label %for.cond Index: llvm/test/Transforms/LoopVectorize/X86/pr35432.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -40,16 +40,16 @@ ; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP6]], i32 [[TMP2]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMAX]] +; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP6]], i32 [[TMP2]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMIN]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP7]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[CONV3]], -1 ; CHECK-NEXT: [[TMP9:%.*]] = zext i8 [[TMP8]] to i32 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i32 [[TMP2]], [[TMP9]] -; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP10]], i32 [[TMP2]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[UMAX1]] +; CHECK-NEXT: [[UMIN1:%.*]] = select i1 [[TMP10]], i32 [[TMP2]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[UMIN1]] ; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 ; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP12]]) ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0 @@ -77,10 +77,6 @@ ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[INDEX]] to i8 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i8 [[CONV3]], [[TMP23]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP24:%.*]] = add i8 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP25:%.*]] = add i8 [[OFFSET_IDX]], -4 ; CHECK-NEXT: [[TMP26]] = add <4 x i32> [[VEC_PHI]], @@ -95,10 +91,10 @@ ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP27]], [[TMP26]] ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX4]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[BIN_RDX3:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX3]], [[RDX_SHUF4]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP7]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: Index: llvm/test/Transforms/LoopVectorize/X86/pr36524.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/pr36524.ll +++ llvm/test/Transforms/LoopVectorize/X86/pr36524.ll @@ -14,9 +14,6 @@ ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 2, [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[OFFSET_IDX1]] to i32 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], Index: llvm/test/Transforms/LoopVectorize/X86/small-size.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -142,19 +142,19 @@ ; CHECK: vector.ph10: ; CHECK-NEXT: [[N_RND_UP11:%.*]] = add nuw nsw i64 [[TMP19]], 4 ; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[N_RND_UP11]], 8589934588 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <4 x i64> undef, i64 [[TMP19]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT28]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <4 x i64> undef, i64 [[TMP19]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT20]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY9:%.*]] ; CHECK: vector.body9: -; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT15:%.*]], [[PRED_STORE_CONTINUE51:%.*]] ] +; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT15:%.*]], [[PRED_STORE_CONTINUE46:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX14]] ; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 3 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT26:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX14]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT27:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT26]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT27]], -; CHECK-NEXT: [[TMP23:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT29]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX14]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT23]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT24]], +; CHECK-NEXT: [[TMP23:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT21]] ; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 ; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -165,101 +165,101 @@ ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ undef, [[VECTOR_BODY9]] ], [ [[TMP27]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] -; CHECK: pred.load.if30: +; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; CHECK: pred.load.if25: ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP20]] ; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] -; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; CHECK: pred.load.continue26: +; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] -; CHECK: pred.load.if32: +; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; CHECK: pred.load.if27: ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP21]] ; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP37]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; CHECK: pred.load.continue28: +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP37]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] -; CHECK: pred.load.if34: +; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK: pred.load.if29: ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP22]] ; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 ; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE35]] -; CHECK: pred.load.continue35: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE33]] ], [ [[TMP42]], [[PRED_LOAD_IF34]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; CHECK: pred.load.continue30: +; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP42]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF36:%.*]], label [[PRED_LOAD_CONTINUE37:%.*]] -; CHECK: pred.load.if36: +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; CHECK: pred.load.if31: ; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> undef, i32 [[TMP46]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE37]] -; CHECK: pred.load.continue37: -; CHECK-NEXT: [[TMP48:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE35]] ], [ [[TMP47]], [[PRED_LOAD_IF36]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] +; CHECK: pred.load.continue32: +; CHECK-NEXT: [[TMP48:%.*]] = phi <4 x i32> [ undef, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP47]], [[PRED_LOAD_IF31]] ] ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39:%.*]] -; CHECK: pred.load.if38: +; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] +; CHECK: pred.load.if33: ; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP20]] ; CHECK-NEXT: [[TMP51:%.*]] = load i32, i32* [[TMP50]], align 4 ; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP51]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE39]] -; CHECK: pred.load.continue39: -; CHECK-NEXT: [[TMP53:%.*]] = phi <4 x i32> [ [[TMP48]], [[PRED_LOAD_CONTINUE37]] ], [ [[TMP52]], [[PRED_LOAD_IF38]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] +; CHECK: pred.load.continue34: +; CHECK-NEXT: [[TMP53:%.*]] = phi <4 x i32> [ [[TMP48]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP52]], [[PRED_LOAD_IF33]] ] ; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP54]], label [[PRED_LOAD_IF40:%.*]], label [[PRED_LOAD_CONTINUE41:%.*]] -; CHECK: pred.load.if40: +; CHECK-NEXT: br i1 [[TMP54]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] +; CHECK: pred.load.if35: ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP21]] ; CHECK-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4 ; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[TMP56]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE41]] -; CHECK: pred.load.continue41: -; CHECK-NEXT: [[TMP58:%.*]] = phi <4 x i32> [ [[TMP53]], [[PRED_LOAD_CONTINUE39]] ], [ [[TMP57]], [[PRED_LOAD_IF40]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] +; CHECK: pred.load.continue36: +; CHECK-NEXT: [[TMP58:%.*]] = phi <4 x i32> [ [[TMP53]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP57]], [[PRED_LOAD_IF35]] ] ; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF42:%.*]], label [[PRED_LOAD_CONTINUE43:%.*]] -; CHECK: pred.load.if42: +; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] +; CHECK: pred.load.if37: ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP22]] ; CHECK-NEXT: [[TMP61:%.*]] = load i32, i32* [[TMP60]], align 4 ; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i32> [[TMP58]], i32 [[TMP61]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE43]] -; CHECK: pred.load.continue43: -; CHECK-NEXT: [[TMP63:%.*]] = phi <4 x i32> [ [[TMP58]], [[PRED_LOAD_CONTINUE41]] ], [ [[TMP62]], [[PRED_LOAD_IF42]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] +; CHECK: pred.load.continue38: +; CHECK-NEXT: [[TMP63:%.*]] = phi <4 x i32> [ [[TMP58]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP62]], [[PRED_LOAD_IF37]] ] ; CHECK-NEXT: [[TMP64:%.*]] = and <4 x i32> [[TMP63]], [[TMP43]] ; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] -; CHECK: pred.store.if44: +; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] +; CHECK: pred.store.if39: ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i32> [[TMP64]], i32 0 ; CHECK-NEXT: store i32 [[TMP67]], i32* [[TMP66]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE45]] -; CHECK: pred.store.continue45: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE40]] +; CHECK: pred.store.continue40: ; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] -; CHECK: pred.store.if46: +; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] +; CHECK: pred.store.if41: ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP20]] ; CHECK-NEXT: [[TMP70:%.*]] = extractelement <4 x i32> [[TMP64]], i32 1 ; CHECK-NEXT: store i32 [[TMP70]], i32* [[TMP69]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE47]] -; CHECK: pred.store.continue47: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE42]] +; CHECK: pred.store.continue42: ; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]] -; CHECK: pred.store.if48: +; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; CHECK: pred.store.if43: ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP21]] ; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i32> [[TMP64]], i32 2 ; CHECK-NEXT: store i32 [[TMP73]], i32* [[TMP72]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE49]] -; CHECK: pred.store.continue49: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE44]] +; CHECK: pred.store.continue44: ; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51]] -; CHECK: pred.store.if50: +; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46]] +; CHECK: pred.store.if45: ; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP22]] ; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x i32> [[TMP64]], i32 3 ; CHECK-NEXT: store i32 [[TMP76]], i32* [[TMP75]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE51]] -; CHECK: pred.store.continue51: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE46]] +; CHECK: pred.store.continue46: ; CHECK-NEXT: [[INDEX_NEXT15]] = add i64 [[INDEX14]], 4 ; CHECK-NEXT: [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[TMP77]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], !llvm.loop !5 @@ -329,15 +329,15 @@ ; CHECK: vector.ph: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT14]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE27:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE25:%.*]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT12]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT13]], -; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -347,32 +347,32 @@ ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ undef, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] -; CHECK: pred.load.if16: +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] +; CHECK: pred.load.if14: ; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP9]], align 16 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] -; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], [[PRED_LOAD_IF16]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] +; CHECK: pred.load.continue15: +; CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], [[PRED_LOAD_IF14]] ] ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] -; CHECK: pred.load.if18: +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] +; CHECK: pred.load.if16: ; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] -; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ undef, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP14]], [[PRED_LOAD_IF18]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] +; CHECK: pred.load.continue17: +; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ undef, [[PRED_LOAD_CONTINUE15]] ], [ [[TMP14]], [[PRED_LOAD_IF16]] ] ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 -; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] -; CHECK: pred.load.if20: +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] +; CHECK: pred.load.if18: ; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[INDEX]], 3 ; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] -; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ undef, [[PRED_LOAD_CONTINUE19]] ], [ [[TMP18]], [[PRED_LOAD_IF20]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] +; CHECK: pred.load.continue19: +; CHECK-NEXT: [[TMP19:%.*]] = phi i32 [ undef, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP18]], [[PRED_LOAD_IF18]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0 ; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: @@ -381,29 +381,29 @@ ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] -; CHECK: pred.store.if22: +; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; CHECK: pred.store.if20: ; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP22]] ; CHECK-NEXT: store i32 [[TMP11]], i32* [[NEXT_GEP5]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] -; CHECK: pred.store.continue23: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK: pred.store.continue21: ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2 -; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] -; CHECK: pred.store.if24: +; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; CHECK: pred.store.if22: ; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP24]] ; CHECK-NEXT: store i32 [[TMP15]], i32* [[NEXT_GEP6]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] -; CHECK: pred.store.continue25: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE23]] +; CHECK: pred.store.continue23: ; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3 -; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27]] -; CHECK: pred.store.if26: +; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25]] +; CHECK: pred.store.if24: ; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[INDEX]], 3 ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP26]] ; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE27]] -; CHECK: pred.store.continue27: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE25]] +; CHECK: pred.store.continue25: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 @@ -535,9 +535,7 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[BROADCAST_SPLATINSERT]], ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -546,7 +544,7 @@ ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1 @@ -555,7 +553,7 @@ ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: ; CHECK-NEXT: [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 2 @@ -564,7 +562,7 @@ ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: ; CHECK-NEXT: [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: ; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[INDEX]], 3 @@ -582,7 +580,7 @@ ; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; CHECK: pred.store.if17: ; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP8]] to i32 @@ -592,7 +590,7 @@ ; CHECK-NEXT: store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] ; CHECK: pred.store.continue18: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] ; CHECK: pred.store.if19: ; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP12]] to i32 @@ -602,7 +600,7 @@ ; CHECK-NEXT: store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] ; CHECK: pred.store.continue20: -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 ; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]] ; CHECK: pred.store.if21: ; CHECK-NEXT: [[TMP29:%.*]] = zext i16 [[TMP16]] to i32 Index: llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -18,10 +18,7 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 Index: llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -24,21 +24,18 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1 @@ -97,12 +94,11 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[TMP0]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6 @@ -112,13 +108,26 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6 ; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]) +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]), !llvm.access.group !6 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7 ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9 ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -156,21 +165,18 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4 +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 Index: llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -1,5 +1,6 @@ -; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED -; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED +; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-gnu" @@ -28,34 +29,149 @@ ; } ; } -;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1( -;DISABLED_MASKED_STRIDED: vector.body: -;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. -;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> -;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], -;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 -;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. -;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> -;DISABLED_MASKED_STRIDED-NOT: for.body: -;DISABLED_MASKED_STRIDED: for.end: - -;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1( -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> -;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) -;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> -;ENABLED_MASKED_STRIDED: for.body: - define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if1: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP11]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; DISABLED_MASKED_STRIDED: pred.load.continue2: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = phi <8 x i8> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP16]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[TMP17]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = phi <8 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP21]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP22]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = insertelement <8 x i8> [[TMP19]], i8 [[TMP23]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = phi <8 x i8> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP26]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP27]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP28]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP29]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = phi <8 x i8> [ [[TMP25]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP30]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP33]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP35]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = phi <8 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP39]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = load i8, i8* [[TMP40]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[TMP41]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = phi <8 x i8> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP42]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP45]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = load i8, i8* [[TMP46]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = insertelement <8 x i8> [[TMP43]], i8 [[TMP47]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = bitcast i8* [[TMP50]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP49]], <8 x i8>* [[TMP51]], i32 1, <8 x i1> [[TMP0]]) +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP52]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; +; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP5]], i32 1, <8 x i1> [[TMP0]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1016 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP6]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; ENABLED_MASKED_STRIDED: for.body: +; ENABLED_MASKED_STRIDED-NEXT: [[IX_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1016, [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_09]], [[CONV]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; ENABLED_MASKED_STRIDED: if.then: +; ENABLED_MASKED_STRIDED-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_09]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[MUL]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[IX_09]] +; ENABLED_MASKED_STRIDED-NEXT: store i8 [[TMP7]], i8* [[ARRAYIDX3]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[FOR_INC]] +; ENABLED_MASKED_STRIDED: for.inc: +; ENABLED_MASKED_STRIDED-NEXT: [[INC]] = add nuw nsw i32 [[IX_09]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !2 +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; entry: %conv = zext i8 %guard to i32 br label %for.body @@ -88,37 +204,136 @@ ; When enable-masked-interleaved-access is enabled, the interleave-groups will ; be vectorized with masked wide-loads with the mask properly shuffled and ; And-ed with the gaps mask. - -;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize( -;ENABLED_MASKED_STRIDED-NEXT: entry: -;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 -;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 -;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer -;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 -;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* -;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> -;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], -;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) -;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> -;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] -;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* -;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]]) -;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], -;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]] -;ENABLED_MASKED_STRIDED-NOT: for.body: -;ENABLED_MASKED_STRIDED: for.end: -;ENABLED_MASKED_STRIDED-NEXT: ret void - - +; define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize { +; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if1: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP11]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; DISABLED_MASKED_STRIDED: pred.load.continue2: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = phi <8 x i8> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP16]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[TMP17]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = phi <8 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP21]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP22]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = insertelement <8 x i8> [[TMP19]], i8 [[TMP23]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = phi <8 x i8> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP26]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP27]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP28]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP29]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = phi <8 x i8> [ [[TMP25]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP30]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP33]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP35]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = phi <8 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP39]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = load i8, i8* [[TMP40]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[TMP41]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = phi <8 x i8> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP42]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP45]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = load i8, i8* [[TMP46]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = insertelement <8 x i8> [[TMP43]], i8 [[TMP47]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = bitcast i8* [[TMP50]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP49]], <8 x i8>* [[TMP51]], i32 1, <8 x i1> [[TMP0]]) +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP52]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !2 +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; +; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; entry: %conv = zext i8 %guard to i32 br label %for.body @@ -171,14 +386,28 @@ ; } ; } ; } - +; +define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize { ; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; DISABLED_MASKED_STRIDED: vector.ph: +; DISABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; DISABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; DISABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; DISABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ -; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}} +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] @@ -188,11 +417,88 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] -; DISABLED_MASKED_STRIDED-NOT: for.body: +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP11]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP17]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP18]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP23]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP24]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP29]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP30]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP35]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, i8* [[TMP36]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP41]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, i8* [[TMP42]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; DISABLED_MASKED_STRIDED: pred.load.if15: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP47]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, i8* [[TMP48]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; DISABLED_MASKED_STRIDED: pred.load.continue16: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP52]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP51]], <8 x i8>* [[TMP53]], i32 1, <8 x i1> [[TMP3]]) +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP54]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !3 ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void - - +; ; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc( ; ENABLED_MASKED_STRIDED-NEXT: entry: ; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -202,18 +508,18 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 ; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> @@ -226,12 +532,10 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], ; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]] -; ENABLED_MASKED_STRIDED-NOT: for.body: +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !5 ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void - -define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize { +; entry: %cmp9 = icmp sgt i32 %n, 0 br i1 %cmp9, label %for.body.lr.ph, label %for.end @@ -289,8 +593,119 @@ ; } ; } ; } - - +; +define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize { +; DISABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; DISABLED_MASKED_STRIDED: vector.ph: +; DISABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; DISABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; DISABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; DISABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP11]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP17]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP18]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP23]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP24]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP29]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP30]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP35]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, i8* [[TMP36]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP41]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, i8* [[TMP42]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; DISABLED_MASKED_STRIDED: pred.load.if15: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP47]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, i8* [[TMP48]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; DISABLED_MASKED_STRIDED: pred.load.continue16: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = bitcast i8* [[TMP52]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP51]], <8 x i8>* [[TMP53]], i32 1, <8 x i1> [[TMP3]]) +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP54]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !4 +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; ; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc( ; ENABLED_MASKED_STRIDED-NEXT: entry: ; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -300,18 +715,18 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 ; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> @@ -324,11 +739,10 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], ; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !6 ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void ; -define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize { entry: %cmp9 = icmp sgt i32 %n, 0 br i1 %cmp9, label %for.body.lr.ph, label %for.end @@ -364,7 +778,7 @@ ; Back to stride 2 with gaps with a known trip count under opt for size, -; but this time the load/store are not predicated. +; but this time the load/store are not predicated. ; When enable-masked-interleaved-access is disabled, the interleave-groups will ; be invalidated during cost-model checks because we have gaps and we can't ; create an epilog. The access is thus scalarized. @@ -382,36 +796,76 @@ ; q[ix] = t; ; } ; } - -;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize( -;DISABLED_MASKED_STRIDED: vector.body: -;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8 -;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0 -;DISABLED_MASKED_STRIDED-NOT: for.body: -;DISABLED_MASKED_STRIDED: for.end: - -;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize( -;ENABLED_MASKED_STRIDED-NEXT: entry: -;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 -;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] -;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* -;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> , <16 x i8> undef) -;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> -;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] -;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>* -;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1 -;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]] -;ENABLED_MASKED_STRIDED-NOT: for.body: -;ENABLED_MASKED_STRIDED: for.end: -;ENABLED_MASKED_STRIDED-NEXT: ret void - - +; define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize { +; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP7]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP11]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP13]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP2]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = load i8, i8* [[TMP4]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP6]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = load i8, i8* [[TMP8]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = load i8, i8* [[TMP10]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = load i8, i8* [[TMP12]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP14]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = load i8, i8* [[TMP16]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = insertelement <8 x i8> undef, i8 [[TMP17]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP18]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[TMP19]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP20]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[TMP21]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[TMP22]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[TMP23]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP24]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP33]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[TMP32]], <8 x i8>* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP35]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; +; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> , <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !7 +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; entry: br label %for.body @@ -433,7 +887,7 @@ ; Unconditioal accesses with gaps under Optsize scenario again, with unknown -; trip-count this time, in order to check the behavior of folding-the-tail +; trip-count this time, in order to check the behavior of folding-the-tail ; (folding the remainder loop into the main loop using masking) together with ; interleaved-groups. Folding-the-tail turns the accesses to conditional which ; requires proper masking. In addition we need to mask out the gaps (all @@ -450,13 +904,24 @@ ; char t = p[2*ix]; ; q[ix] = t; ; } - +; +define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize { ; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; DISABLED_MASKED_STRIDED: vector.ph: +; DISABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; DISABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; DISABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE14]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: @@ -465,10 +930,88 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] -; DISBLED_MASKED_STRIDED-NOT: for.body: +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if1: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP11]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; DISABLED_MASKED_STRIDED: pred.load.continue2: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = phi <8 x i8> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP16]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[TMP17]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = phi <8 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP21]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP22]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = insertelement <8 x i8> [[TMP19]], i8 [[TMP23]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = phi <8 x i8> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP26]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP27]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP28]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP29]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = phi <8 x i8> [ [[TMP25]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP30]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP33]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP35]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = phi <8 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP39]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = load i8, i8* [[TMP40]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[TMP41]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = phi <8 x i8> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP42]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = extractelement <8 x i32> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP45]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = load i8, i8* [[TMP46]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = insertelement <8 x i8> [[TMP43]], i8 [[TMP47]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = bitcast i8* [[TMP50]] to <8 x i8>* +; DISABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP49]], <8 x i8>* [[TMP51]], i32 1, <8 x i1> [[TMP1]]) +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP52]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !6 ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void - +; ; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc( ; ENABLED_MASKED_STRIDED-NEXT: entry: ; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 @@ -477,33 +1020,29 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 ; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <8 x i32> [[BROADCAST_SPLAT]], +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[BROADCAST_SPLATINSERT1]], [[BROADCAST_SPLATINSERT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[TMP5]], <16 x i8> undef) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]]) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP7]], i32 1, <8 x i1> [[TMP3]]) ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]] -; ENABLED_MASKED_STRIDED-NOT: for.body: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !8 ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void - -define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize { +; entry: %cmp6 = icmp sgt i32 %n, 0 br i1 %cmp6, label %for.body.preheader, label %for.end @@ -553,37 +1092,367 @@ ; } ; } ;} - -;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2( -;DISABLED_MASKED_STRIDED: vector.body: -;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. -;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> -;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], -;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 -;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. -;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> - -;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2( -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> -;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) -;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> -;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> -;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask) - -; Function Attrs: norecurse nounwind +; define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +; DISABLED_MASKED_STRIDED-LABEL: @masked_strided2( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if1: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP11]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; DISABLED_MASKED_STRIDED: pred.load.continue2: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = phi <8 x i8> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP16]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[TMP17]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = phi <8 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP21]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP22]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = insertelement <8 x i8> [[TMP19]], i8 [[TMP23]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = phi <8 x i8> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP26]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP27]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP28]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP29]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = phi <8 x i8> [ [[TMP25]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP30]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP33]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP35]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = phi <8 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP39]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = load i8, i8* [[TMP40]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[TMP41]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = phi <8 x i8> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP42]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP45]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = load i8, i8* [[TMP46]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = insertelement <8 x i8> [[TMP43]], i8 [[TMP47]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = or <8 x i32> [[TMP1]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if15: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = extractelement <8 x i32> [[TMP50]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP52]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = load i8, i8* [[TMP53]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP55:%.*]] = insertelement <8 x i8> undef, i8 [[TMP54]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; DISABLED_MASKED_STRIDED: pred.load.continue16: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP56:%.*]] = phi <8 x i8> [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP55]], [[PRED_LOAD_IF15]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP57:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if17: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP58:%.*]] = extractelement <8 x i32> [[TMP50]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP58]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP60:%.*]] = load i8, i8* [[TMP59]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP61:%.*]] = insertelement <8 x i8> [[TMP56]], i8 [[TMP60]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; DISABLED_MASKED_STRIDED: pred.load.continue18: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP62:%.*]] = phi <8 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP61]], [[PRED_LOAD_IF17]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP63:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP63]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if19: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP64:%.*]] = extractelement <8 x i32> [[TMP50]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP64]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP66:%.*]] = load i8, i8* [[TMP65]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP67:%.*]] = insertelement <8 x i8> [[TMP62]], i8 [[TMP66]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; DISABLED_MASKED_STRIDED: pred.load.continue20: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP68:%.*]] = phi <8 x i8> [ [[TMP62]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP67]], [[PRED_LOAD_IF19]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP69:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if21: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP70:%.*]] = extractelement <8 x i32> [[TMP50]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP70]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP72:%.*]] = load i8, i8* [[TMP71]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP73:%.*]] = insertelement <8 x i8> [[TMP68]], i8 [[TMP72]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE22]] +; DISABLED_MASKED_STRIDED: pred.load.continue22: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP74:%.*]] = phi <8 x i8> [ [[TMP68]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP73]], [[PRED_LOAD_IF21]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP75:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP75]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if23: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP76:%.*]] = extractelement <8 x i32> [[TMP50]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP77:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP76]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP78:%.*]] = load i8, i8* [[TMP77]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP79:%.*]] = insertelement <8 x i8> [[TMP74]], i8 [[TMP78]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE24]] +; DISABLED_MASKED_STRIDED: pred.load.continue24: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP80:%.*]] = phi <8 x i8> [ [[TMP74]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP79]], [[PRED_LOAD_IF23]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP81:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP81]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if25: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP82:%.*]] = extractelement <8 x i32> [[TMP50]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP82]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP84:%.*]] = load i8, i8* [[TMP83]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP85:%.*]] = insertelement <8 x i8> [[TMP80]], i8 [[TMP84]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; DISABLED_MASKED_STRIDED: pred.load.continue26: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP86:%.*]] = phi <8 x i8> [ [[TMP80]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP87:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if27: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP88:%.*]] = extractelement <8 x i32> [[TMP50]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP89:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP88]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP90:%.*]] = load i8, i8* [[TMP89]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP91:%.*]] = insertelement <8 x i8> [[TMP86]], i8 [[TMP90]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; DISABLED_MASKED_STRIDED: pred.load.continue28: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP92:%.*]] = phi <8 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP91]], [[PRED_LOAD_IF27]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP93:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP93]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if29: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP94:%.*]] = extractelement <8 x i32> [[TMP50]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP95:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP94]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP96:%.*]] = load i8, i8* [[TMP95]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP97:%.*]] = insertelement <8 x i8> [[TMP92]], i8 [[TMP96]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; DISABLED_MASKED_STRIDED: pred.load.continue30: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP98:%.*]] = phi <8 x i8> [ [[TMP92]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP97]], [[PRED_LOAD_IF29]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP99:%.*]] = icmp slt <8 x i8> [[TMP49]], [[TMP98]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP100:%.*]] = select <8 x i1> [[TMP99]], <8 x i8> [[TMP98]], <8 x i8> [[TMP49]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP101:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP101]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP102:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP103:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP102]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP104:%.*]] = extractelement <8 x i8> [[TMP100]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP104]], i8* [[TMP103]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.store.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP105]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if31: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP106]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = extractelement <8 x i8> [[TMP100]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP108]], i8* [[TMP107]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE32]] +; DISABLED_MASKED_STRIDED: pred.store.continue32: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP109]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if33: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP112]], i8* [[TMP111]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE34]] +; DISABLED_MASKED_STRIDED: pred.store.continue34: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP113]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if35: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP114]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = extractelement <8 x i8> [[TMP100]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP116]], i8* [[TMP115]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE36]] +; DISABLED_MASKED_STRIDED: pred.store.continue36: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP117]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if37: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP120]], i8* [[TMP119]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE38]] +; DISABLED_MASKED_STRIDED: pred.store.continue38: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP121]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if39: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP122]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = extractelement <8 x i8> [[TMP100]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP124]], i8* [[TMP123]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE40]] +; DISABLED_MASKED_STRIDED: pred.store.continue40: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP125]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if41: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP128]], i8* [[TMP127]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE42]] +; DISABLED_MASKED_STRIDED: pred.store.continue42: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP129]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if43: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP130]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = extractelement <8 x i8> [[TMP100]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP132]], i8* [[TMP131]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] +; DISABLED_MASKED_STRIDED: pred.store.continue44: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = sub <8 x i8> zeroinitializer, [[TMP100]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP134]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if45: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i32> [[TMP50]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP133]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP137]], i8* [[TMP136]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] +; DISABLED_MASKED_STRIDED: pred.store.continue46: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = extractelement <8 x i1> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP138]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if47: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i8> [[TMP133]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP141]], i8* [[TMP140]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE48]] +; DISABLED_MASKED_STRIDED: pred.store.continue48: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i1> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP142]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if49: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i32> [[TMP50]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP143]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i8> [[TMP133]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP145]], i8* [[TMP144]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE50]] +; DISABLED_MASKED_STRIDED: pred.store.continue50: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i1> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP146]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if51: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i8> [[TMP133]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP149]], i8* [[TMP148]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE52]] +; DISABLED_MASKED_STRIDED: pred.store.continue52: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i1> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP150]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if53: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i32> [[TMP50]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP151]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = extractelement <8 x i8> [[TMP133]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP153]], i8* [[TMP152]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE54]] +; DISABLED_MASKED_STRIDED: pred.store.continue54: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i1> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP154]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if55: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i8> [[TMP133]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP157]], i8* [[TMP156]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE56]] +; DISABLED_MASKED_STRIDED: pred.store.continue56: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i1> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP158]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if57: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i32> [[TMP50]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP159]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i8> [[TMP133]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP161]], i8* [[TMP160]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE58]] +; DISABLED_MASKED_STRIDED: pred.store.continue58: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i1> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP162]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60]] +; DISABLED_MASKED_STRIDED: pred.store.if59: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i8> [[TMP133]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP165]], i8* [[TMP164]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] +; DISABLED_MASKED_STRIDED: pred.store.continue60: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP166]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !7 +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; +; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !9 +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; entry: %conv = zext i8 %guard to i32 br label %for.body @@ -643,15 +1512,363 @@ ; } ; } ;} - +; +define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize { +; DISABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CMP22:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP22]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; DISABLED_MASKED_STRIDED: vector.ph: +; DISABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; DISABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; DISABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[GUARD:%.*]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE62:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE62]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP11]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP17]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP18]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP23]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, i8* [[TMP24]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP29]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, i8* [[TMP30]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP35]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, i8* [[TMP36]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP41]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, i8* [[TMP42]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if15: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP47]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, i8* [[TMP48]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; DISABLED_MASKED_STRIDED: pred.load.continue16: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = or <8 x i32> [[TMP1]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP53]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if17: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = extractelement <8 x i32> [[TMP52]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP55:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP54]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP56:%.*]] = load i8, i8* [[TMP55]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP57:%.*]] = insertelement <8 x i8> undef, i8 [[TMP56]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; DISABLED_MASKED_STRIDED: pred.load.continue18: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP58:%.*]] = phi <8 x i8> [ undef, [[PRED_LOAD_CONTINUE16]] ], [ [[TMP57]], [[PRED_LOAD_IF17]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if19: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP60:%.*]] = extractelement <8 x i32> [[TMP52]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP60]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP62:%.*]] = load i8, i8* [[TMP61]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP63:%.*]] = insertelement <8 x i8> [[TMP58]], i8 [[TMP62]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; DISABLED_MASKED_STRIDED: pred.load.continue20: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP64:%.*]] = phi <8 x i8> [ [[TMP58]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP63]], [[PRED_LOAD_IF19]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP65]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if21: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP66:%.*]] = extractelement <8 x i32> [[TMP52]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP66]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP68:%.*]] = load i8, i8* [[TMP67]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP69:%.*]] = insertelement <8 x i8> [[TMP64]], i8 [[TMP68]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE22]] +; DISABLED_MASKED_STRIDED: pred.load.continue22: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP70:%.*]] = phi <8 x i8> [ [[TMP64]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP69]], [[PRED_LOAD_IF21]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP71]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if23: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP72:%.*]] = extractelement <8 x i32> [[TMP52]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP72]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP74:%.*]] = load i8, i8* [[TMP73]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP75:%.*]] = insertelement <8 x i8> [[TMP70]], i8 [[TMP74]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE24]] +; DISABLED_MASKED_STRIDED: pred.load.continue24: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP76:%.*]] = phi <8 x i8> [ [[TMP70]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP75]], [[PRED_LOAD_IF23]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP77:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if25: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP78:%.*]] = extractelement <8 x i32> [[TMP52]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP79:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP78]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP80:%.*]] = load i8, i8* [[TMP79]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP81:%.*]] = insertelement <8 x i8> [[TMP76]], i8 [[TMP80]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; DISABLED_MASKED_STRIDED: pred.load.continue26: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP82:%.*]] = phi <8 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP81]], [[PRED_LOAD_IF25]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP83:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP83]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if27: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP84:%.*]] = extractelement <8 x i32> [[TMP52]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP85:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP84]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP86:%.*]] = load i8, i8* [[TMP85]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP87:%.*]] = insertelement <8 x i8> [[TMP82]], i8 [[TMP86]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; DISABLED_MASKED_STRIDED: pred.load.continue28: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP88:%.*]] = phi <8 x i8> [ [[TMP82]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP87]], [[PRED_LOAD_IF27]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP89:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if29: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP90:%.*]] = extractelement <8 x i32> [[TMP52]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP91:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP90]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP92:%.*]] = load i8, i8* [[TMP91]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP93:%.*]] = insertelement <8 x i8> [[TMP88]], i8 [[TMP92]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; DISABLED_MASKED_STRIDED: pred.load.continue30: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP94:%.*]] = phi <8 x i8> [ [[TMP88]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP93]], [[PRED_LOAD_IF29]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP95:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP95]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if31: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP96:%.*]] = extractelement <8 x i32> [[TMP52]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP97:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP96]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP98:%.*]] = load i8, i8* [[TMP97]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP99:%.*]] = insertelement <8 x i8> [[TMP94]], i8 [[TMP98]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE32]] +; DISABLED_MASKED_STRIDED: pred.load.continue32: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP100:%.*]] = phi <8 x i8> [ [[TMP94]], [[PRED_LOAD_CONTINUE30]] ], [ [[TMP99]], [[PRED_LOAD_IF31]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP101:%.*]] = icmp slt <8 x i8> [[TMP51]], [[TMP100]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP102:%.*]] = select <8 x i1> [[TMP101]], <8 x i8> [[TMP100]], <8 x i8> [[TMP51]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP103:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP103]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP104:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP104]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i8> [[TMP102]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP106]], i8* [[TMP105]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.store.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP107]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if33: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP108]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i8> [[TMP102]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP110]], i8* [[TMP109]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE34]] +; DISABLED_MASKED_STRIDED: pred.store.continue34: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP111]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if35: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP112]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i8> [[TMP102]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP114]], i8* [[TMP113]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE36]] +; DISABLED_MASKED_STRIDED: pred.store.continue36: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP115]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if37: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP116]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i8> [[TMP102]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP118]], i8* [[TMP117]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE38]] +; DISABLED_MASKED_STRIDED: pred.store.continue38: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP119]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if39: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP120]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i8> [[TMP102]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP122]], i8* [[TMP121]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE40]] +; DISABLED_MASKED_STRIDED: pred.store.continue40: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP123]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if41: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP124]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i8> [[TMP102]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP126]], i8* [[TMP125]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE42]] +; DISABLED_MASKED_STRIDED: pred.store.continue42: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP127]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if43: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP128]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i8> [[TMP102]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP130]], i8* [[TMP129]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] +; DISABLED_MASKED_STRIDED: pred.store.continue44: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP131]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if45: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP132]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i8> [[TMP102]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP134]], i8* [[TMP133]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] +; DISABLED_MASKED_STRIDED: pred.store.continue46: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = sub <8 x i8> zeroinitializer, [[TMP102]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP136]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if47: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i32> [[TMP52]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP137]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i8> [[TMP135]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP139]], i8* [[TMP138]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE48]] +; DISABLED_MASKED_STRIDED: pred.store.continue48: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = extractelement <8 x i1> [[TMP3]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP140]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if49: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i32> [[TMP52]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP141]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i8> [[TMP135]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP143]], i8* [[TMP142]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE50]] +; DISABLED_MASKED_STRIDED: pred.store.continue50: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = extractelement <8 x i1> [[TMP3]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP144]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if51: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i32> [[TMP52]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP145]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i8> [[TMP135]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP147]], i8* [[TMP146]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE52]] +; DISABLED_MASKED_STRIDED: pred.store.continue52: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = extractelement <8 x i1> [[TMP3]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP148]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if53: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i32> [[TMP52]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP149]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i8> [[TMP135]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP151]], i8* [[TMP150]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE54]] +; DISABLED_MASKED_STRIDED: pred.store.continue54: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = extractelement <8 x i1> [[TMP3]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP152]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if55: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = extractelement <8 x i32> [[TMP52]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP153]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i8> [[TMP135]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP155]], i8* [[TMP154]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE56]] +; DISABLED_MASKED_STRIDED: pred.store.continue56: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = extractelement <8 x i1> [[TMP3]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP156]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if57: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i32> [[TMP52]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP157]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i8> [[TMP135]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP159]], i8* [[TMP158]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE58]] +; DISABLED_MASKED_STRIDED: pred.store.continue58: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = extractelement <8 x i1> [[TMP3]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP160]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if59: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i32> [[TMP52]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP161]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i8> [[TMP135]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP163]], i8* [[TMP162]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] +; DISABLED_MASKED_STRIDED: pred.store.continue60: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = extractelement <8 x i1> [[TMP3]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP164]], label [[PRED_STORE_IF61:%.*]], label [[PRED_STORE_CONTINUE62]] +; DISABLED_MASKED_STRIDED: pred.store.if61: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i32> [[TMP52]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP165]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP167:%.*]] = extractelement <8 x i8> [[TMP135]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP167]], i8* [[TMP166]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE62]] +; DISABLED_MASKED_STRIDED: pred.store.continue62: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP168:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP168]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !8 +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void +; ; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CMP22:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP22]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; ENABLED_MASKED_STRIDED: vector.ph: +; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[GUARD:%.*]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ -; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> @@ -667,14 +1884,13 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) -; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add <8 x i32> {{.*}}, -; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}} -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]], +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !10 ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void - -define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize { +; entry: %cmp22 = icmp sgt i32 %n, 0 br i1 %cmp22, label %for.body.preheader, label %for.end @@ -740,13 +1956,24 @@ ; q[2*ix+1] = 0 - max; ; } ;} - +; +define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize { ; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc( +; DISABLED_MASKED_STRIDED-NEXT: entry: +; DISABLED_MASKED_STRIDED-NEXT: [[CMP20:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP20]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; DISABLED_MASKED_STRIDED: vector.ph: +; DISABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; DISABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; DISABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE60:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE60]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], -; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; DISABLED_MASKED_STRIDED: pred.load.if: @@ -755,23 +1982,331 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 ; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0 ; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] -; DISABLED_MASKED_STRIDED-NOT: for.body: +; DISABLED_MASKED_STRIDED: pred.load.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = phi <8 x i8> [ undef, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if1: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP9]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP11]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; DISABLED_MASKED_STRIDED: pred.load.continue2: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = phi <8 x i8> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF1]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP14]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if3: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP15]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP16]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[TMP17]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; DISABLED_MASKED_STRIDED: pred.load.continue4: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = phi <8 x i8> [ [[TMP13]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if5: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP21]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = load i8, i8* [[TMP22]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = insertelement <8 x i8> [[TMP19]], i8 [[TMP23]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; DISABLED_MASKED_STRIDED: pred.load.continue6: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = phi <8 x i8> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP26]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if7: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP27]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = load i8, i8* [[TMP28]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[TMP29]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; DISABLED_MASKED_STRIDED: pred.load.continue8: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = phi <8 x i8> [ [[TMP25]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP30]], [[PRED_LOAD_IF7]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if9: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP33]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = load i8, i8* [[TMP34]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[TMP35]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; DISABLED_MASKED_STRIDED: pred.load.continue10: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = phi <8 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if11: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP39]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = load i8, i8* [[TMP40]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[TMP41]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; DISABLED_MASKED_STRIDED: pred.load.continue12: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = phi <8 x i8> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP42]], [[PRED_LOAD_IF11]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if13: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = extractelement <8 x i32> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP45]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = load i8, i8* [[TMP46]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = insertelement <8 x i8> [[TMP43]], i8 [[TMP47]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; DISABLED_MASKED_STRIDED: pred.load.continue14: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = phi <8 x i8> [ [[TMP43]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP48]], [[PRED_LOAD_IF13]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = or <8 x i32> [[TMP0]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if15: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = extractelement <8 x i32> [[TMP50]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP52]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP54:%.*]] = load i8, i8* [[TMP53]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP55:%.*]] = insertelement <8 x i8> undef, i8 [[TMP54]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; DISABLED_MASKED_STRIDED: pred.load.continue16: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP56:%.*]] = phi <8 x i8> [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP55]], [[PRED_LOAD_IF15]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP57:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if17: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP58:%.*]] = extractelement <8 x i32> [[TMP50]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP58]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP60:%.*]] = load i8, i8* [[TMP59]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP61:%.*]] = insertelement <8 x i8> [[TMP56]], i8 [[TMP60]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE18]] +; DISABLED_MASKED_STRIDED: pred.load.continue18: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP62:%.*]] = phi <8 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP61]], [[PRED_LOAD_IF17]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP63:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP63]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if19: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP64:%.*]] = extractelement <8 x i32> [[TMP50]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP65:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP64]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP66:%.*]] = load i8, i8* [[TMP65]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP67:%.*]] = insertelement <8 x i8> [[TMP62]], i8 [[TMP66]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE20]] +; DISABLED_MASKED_STRIDED: pred.load.continue20: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP68:%.*]] = phi <8 x i8> [ [[TMP62]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP67]], [[PRED_LOAD_IF19]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP69:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if21: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP70:%.*]] = extractelement <8 x i32> [[TMP50]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP71:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP70]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP72:%.*]] = load i8, i8* [[TMP71]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP73:%.*]] = insertelement <8 x i8> [[TMP68]], i8 [[TMP72]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE22]] +; DISABLED_MASKED_STRIDED: pred.load.continue22: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP74:%.*]] = phi <8 x i8> [ [[TMP68]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP73]], [[PRED_LOAD_IF21]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP75:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP75]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if23: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP76:%.*]] = extractelement <8 x i32> [[TMP50]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP77:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP76]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP78:%.*]] = load i8, i8* [[TMP77]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP79:%.*]] = insertelement <8 x i8> [[TMP74]], i8 [[TMP78]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE24]] +; DISABLED_MASKED_STRIDED: pred.load.continue24: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP80:%.*]] = phi <8 x i8> [ [[TMP74]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP79]], [[PRED_LOAD_IF23]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP81:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP81]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if25: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP82:%.*]] = extractelement <8 x i32> [[TMP50]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP82]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP84:%.*]] = load i8, i8* [[TMP83]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP85:%.*]] = insertelement <8 x i8> [[TMP80]], i8 [[TMP84]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE26]] +; DISABLED_MASKED_STRIDED: pred.load.continue26: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP86:%.*]] = phi <8 x i8> [ [[TMP80]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP87:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if27: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP88:%.*]] = extractelement <8 x i32> [[TMP50]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP89:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP88]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP90:%.*]] = load i8, i8* [[TMP89]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP91:%.*]] = insertelement <8 x i8> [[TMP86]], i8 [[TMP90]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE28]] +; DISABLED_MASKED_STRIDED: pred.load.continue28: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP92:%.*]] = phi <8 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP91]], [[PRED_LOAD_IF27]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP93:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP93]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if29: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP94:%.*]] = extractelement <8 x i32> [[TMP50]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP95:%.*]] = getelementptr inbounds i8, i8* [[P]], i32 [[TMP94]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP96:%.*]] = load i8, i8* [[TMP95]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP97:%.*]] = insertelement <8 x i8> [[TMP92]], i8 [[TMP96]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE30]] +; DISABLED_MASKED_STRIDED: pred.load.continue30: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP98:%.*]] = phi <8 x i8> [ [[TMP92]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP97]], [[PRED_LOAD_IF29]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP99:%.*]] = icmp slt <8 x i8> [[TMP49]], [[TMP98]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP100:%.*]] = select <8 x i1> [[TMP99]], <8 x i8> [[TMP98]], <8 x i8> [[TMP49]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP101:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP101]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP102:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP103:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP102]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP104:%.*]] = extractelement <8 x i8> [[TMP100]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP104]], i8* [[TMP103]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE]] +; DISABLED_MASKED_STRIDED: pred.store.continue: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP105:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP105]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if31: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP106:%.*]] = extractelement <8 x i32> [[TMP0]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP107:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP106]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP108:%.*]] = extractelement <8 x i8> [[TMP100]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP108]], i8* [[TMP107]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE32]] +; DISABLED_MASKED_STRIDED: pred.store.continue32: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP109:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP109]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if33: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP110:%.*]] = extractelement <8 x i32> [[TMP0]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP111:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP110]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP112:%.*]] = extractelement <8 x i8> [[TMP100]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP112]], i8* [[TMP111]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE34]] +; DISABLED_MASKED_STRIDED: pred.store.continue34: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP113:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP113]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if35: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP114:%.*]] = extractelement <8 x i32> [[TMP0]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP115:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP114]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP116:%.*]] = extractelement <8 x i8> [[TMP100]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP116]], i8* [[TMP115]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE36]] +; DISABLED_MASKED_STRIDED: pred.store.continue36: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP117:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP117]], label [[PRED_STORE_IF37:%.*]], label [[PRED_STORE_CONTINUE38:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if37: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP118:%.*]] = extractelement <8 x i32> [[TMP0]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP118]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP120:%.*]] = extractelement <8 x i8> [[TMP100]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP120]], i8* [[TMP119]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE38]] +; DISABLED_MASKED_STRIDED: pred.store.continue38: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP121:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP121]], label [[PRED_STORE_IF39:%.*]], label [[PRED_STORE_CONTINUE40:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if39: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP122:%.*]] = extractelement <8 x i32> [[TMP0]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP123:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP122]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP124:%.*]] = extractelement <8 x i8> [[TMP100]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP124]], i8* [[TMP123]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE40]] +; DISABLED_MASKED_STRIDED: pred.store.continue40: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP125:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP125]], label [[PRED_STORE_IF41:%.*]], label [[PRED_STORE_CONTINUE42:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if41: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP126:%.*]] = extractelement <8 x i32> [[TMP0]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP127:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP126]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP128:%.*]] = extractelement <8 x i8> [[TMP100]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP128]], i8* [[TMP127]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE42]] +; DISABLED_MASKED_STRIDED: pred.store.continue42: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP129:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP129]], label [[PRED_STORE_IF43:%.*]], label [[PRED_STORE_CONTINUE44:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if43: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP130:%.*]] = extractelement <8 x i32> [[TMP0]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP131:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP130]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP132:%.*]] = extractelement <8 x i8> [[TMP100]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP132]], i8* [[TMP131]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE44]] +; DISABLED_MASKED_STRIDED: pred.store.continue44: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP133:%.*]] = sub <8 x i8> zeroinitializer, [[TMP100]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP134:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP134]], label [[PRED_STORE_IF45:%.*]], label [[PRED_STORE_CONTINUE46:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if45: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP135:%.*]] = extractelement <8 x i32> [[TMP50]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP135]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP137:%.*]] = extractelement <8 x i8> [[TMP133]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP137]], i8* [[TMP136]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE46]] +; DISABLED_MASKED_STRIDED: pred.store.continue46: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP138:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP138]], label [[PRED_STORE_IF47:%.*]], label [[PRED_STORE_CONTINUE48:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if47: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP139:%.*]] = extractelement <8 x i32> [[TMP50]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP140:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP139]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP141:%.*]] = extractelement <8 x i8> [[TMP133]], i32 1 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP141]], i8* [[TMP140]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE48]] +; DISABLED_MASKED_STRIDED: pred.store.continue48: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP142:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP142]], label [[PRED_STORE_IF49:%.*]], label [[PRED_STORE_CONTINUE50:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if49: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP143:%.*]] = extractelement <8 x i32> [[TMP50]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP143]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP145:%.*]] = extractelement <8 x i8> [[TMP133]], i32 2 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP145]], i8* [[TMP144]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE50]] +; DISABLED_MASKED_STRIDED: pred.store.continue50: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP146:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP146]], label [[PRED_STORE_IF51:%.*]], label [[PRED_STORE_CONTINUE52:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if51: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP147:%.*]] = extractelement <8 x i32> [[TMP50]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP148:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP147]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP149:%.*]] = extractelement <8 x i8> [[TMP133]], i32 3 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP149]], i8* [[TMP148]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE52]] +; DISABLED_MASKED_STRIDED: pred.store.continue52: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP150:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP150]], label [[PRED_STORE_IF53:%.*]], label [[PRED_STORE_CONTINUE54:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if53: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP151:%.*]] = extractelement <8 x i32> [[TMP50]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP152:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP151]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP153:%.*]] = extractelement <8 x i8> [[TMP133]], i32 4 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP153]], i8* [[TMP152]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE54]] +; DISABLED_MASKED_STRIDED: pred.store.continue54: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP154:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP154]], label [[PRED_STORE_IF55:%.*]], label [[PRED_STORE_CONTINUE56:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if55: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP155:%.*]] = extractelement <8 x i32> [[TMP50]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP156:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP155]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP157:%.*]] = extractelement <8 x i8> [[TMP133]], i32 5 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP157]], i8* [[TMP156]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE56]] +; DISABLED_MASKED_STRIDED: pred.store.continue56: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP158:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP158]], label [[PRED_STORE_IF57:%.*]], label [[PRED_STORE_CONTINUE58:%.*]] +; DISABLED_MASKED_STRIDED: pred.store.if57: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP159:%.*]] = extractelement <8 x i32> [[TMP50]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP160:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP159]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP161:%.*]] = extractelement <8 x i8> [[TMP133]], i32 6 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP161]], i8* [[TMP160]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE58]] +; DISABLED_MASKED_STRIDED: pred.store.continue58: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP162:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP162]], label [[PRED_STORE_IF59:%.*]], label [[PRED_STORE_CONTINUE60]] +; DISABLED_MASKED_STRIDED: pred.store.if59: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP163:%.*]] = extractelement <8 x i32> [[TMP50]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, i8* [[Q]], i32 [[TMP163]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP165:%.*]] = extractelement <8 x i8> [[TMP133]], i32 7 +; DISABLED_MASKED_STRIDED-NEXT: store i8 [[TMP165]], i8* [[TMP164]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_STORE_CONTINUE60]] +; DISABLED_MASKED_STRIDED: pred.store.continue60: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], +; DISABLED_MASKED_STRIDED-NEXT: [[TMP166:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP166]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !9 ; DISABLED_MASKED_STRIDED: for.end: ; DISABLED_MASKED_STRIDED-NEXT: ret void - - - +; ; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CMP20:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP20]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; ENABLED_MASKED_STRIDED: vector.ph: +; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0 -; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <8 x i32> [[BROADCAST_SPLAT]], +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[BROADCAST_SPLATINSERT1]], [[BROADCAST_SPLATINSERT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef) ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> ; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> @@ -784,13 +2319,12 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>* ; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) -; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}} -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop !11 ; ENABLED_MASKED_STRIDED: for.end: ; ENABLED_MASKED_STRIDED-NEXT: ret void - -define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize { +; entry: %cmp20 = icmp sgt i32 %n, 0 br i1 %cmp20, label %for.body.preheader, label %for.end Index: llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll +++ llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll @@ -6,8 +6,6 @@ ; CHECK-LABEL: test1( ; CHECK-LABEL: vector.body: ; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %induction = add <4 x i32> %broadcast.splat, ; CHECK: %index.next = add i32 %index, 4 entry: Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll @@ -8,8 +8,8 @@ define void @can_sink_after_store(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 { ; CHECK-LABEL: vector.ph: -; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0 -; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0 +; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 ; CHECK-NEXT: br label %vector.body @@ -17,16 +17,13 @@ ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] ; CHECK-NEXT: %offset.idx = add i64 1, %index -; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0 -; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, ; CHECK-NEXT: %0 = add i64 %offset.idx, 0 ; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0 ; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0 ; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>* ; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4 ; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> -; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat2 +; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat ; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load ; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0 ; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0 @@ -65,8 +62,8 @@ ; and not introduce traps on additional paths. define void @sink_sdiv(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 { ; CHECK-LABEL: vector.ph: -; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0 -; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0 +; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3 ; CHECK-NEXT: br label %vector.body @@ -74,16 +71,13 @@ ; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ] ; CHECK-NEXT: %offset.idx = add i64 1, %index -; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0 -; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, ; CHECK-NEXT: %0 = add i64 %offset.idx, 0 ; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0 ; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0 ; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>* ; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4 ; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> -; CHECK-NEXT: %5 = sdiv <4 x i32> %4, %broadcast.splat2 +; CHECK-NEXT: %5 = sdiv <4 x i32> %4, %broadcast.splat ; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load ; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0 ; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0 Index: llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll +++ llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll @@ -57,9 +57,6 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, float* [[ARG]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[TMP1]], i32 0 Index: llvm/test/Transforms/LoopVectorize/if-pred-stores.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -113,9 +113,6 @@ ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0 -; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer -; VEC-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], ; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[TMP0]] ; VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 @@ -458,25 +455,19 @@ ; ; VEC-LABEL: @minimal_bit_widths( ; VEC-NEXT: entry: -; VEC-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> undef, i1 [[C:%.*]], i32 0 -; VEC-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT5]], <2 x i1> undef, <2 x i32> zeroinitializer +; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> undef, i1 [[C:%.*]], i32 0 +; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> undef, <2 x i32> zeroinitializer ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: -; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] -; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0 -; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer -; VEC-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], +; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ] ; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; VEC-NEXT: [[OFFSET_IDX:%.*]] = sub i64 undef, [[INDEX]] -; VEC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> undef, i64 [[OFFSET_IDX]], i32 0 -; VEC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> undef, <2 x i32> zeroinitializer -; VEC-NEXT: [[INDUCTION4:%.*]] = add <2 x i64> [[BROADCAST_SPLAT3]], ; VEC-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0 ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]] ; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i32 0 ; VEC-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <2 x i8>* ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP4]], align 1 -; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0 +; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 ; VEC-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; VEC: pred.store.if: ; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 @@ -485,17 +476,17 @@ ; VEC-NEXT: store i8 [[TMP8]], i8* [[TMP2]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE]] ; VEC: pred.store.continue: -; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1 -; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] -; VEC: pred.store.if7: +; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1 +; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] +; VEC: pred.store.if2: ; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 ; VEC-NEXT: [[TMP11:%.*]] = zext i8 [[TMP10]] to i32 ; VEC-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 ; VEC-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 ; VEC-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* undef, i64 [[TMP13]] ; VEC-NEXT: store i8 [[TMP12]], i8* [[TMP14]], align 1 -; VEC-NEXT: br label [[PRED_STORE_CONTINUE8]] -; VEC: pred.store.continue8: +; VEC-NEXT: br label [[PRED_STORE_CONTINUE3]] +; VEC: pred.store.continue3: ; VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef ; VEC-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 Index: llvm/test/Transforms/LoopVectorize/if-reduction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -100,7 +100,7 @@ ; } ; CHECK-LABEL: @fcmp_val_fadd_select1( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2 +; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat ; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]] ; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly { @@ -143,7 +143,7 @@ ; } ; CHECK-LABEL: @fcmp_val_fadd_select2( -; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2 +; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat ; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]] ; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly { Index: llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll +++ llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -force-vector-width=4 -S < %s | FileCheck %s ; This is the test case from PR26314. @@ -22,16 +23,95 @@ ; } ; } -; CHECK-LABEL: Test -; CHECK: <4 x i64> -; CHECK: <4 x i32>, <4 x i32> -; CHECK: !{!"llvm.loop.isvectorized", i32 1} - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] } define void @Test(%struct.s* nocapture %obj, i64 %z) #0 { +; CHECK-LABEL: @Test( +; CHECK-NEXT: [[OBJ4:%.*]] = bitcast %struct.s* [[OBJ:%.*]] to i8* +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr [[STRUCT_S:%.*]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[Z:%.*]] +; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8* +; CHECK-NEXT: br label [[DOTOUTER_PREHEADER:%.*]] +; CHECK: .outer.preheader: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[I_NEXT:%.*]], [[DOTOUTER:%.*]] ] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 0 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[Z]] +; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast i32* [[SCEVGEP2]] to i8* +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]] +; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP78]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 1, i64 [[I]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[Z]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP56]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OBJ4]], [[SCEVGEP23]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BC:%.*]] = bitcast i32* [[TMP1]] to i8* +; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[SCEVGEP1]], [[UGLYGEP]] +; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[BC]], [[SCEVGEP23]] +; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[Z]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[Z]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !0 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP1]], !alias.scope !3 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP17]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[TMP14]], [[WIDE_LOAD12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[Z]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[DOTOUTER]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DOTOUTER_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[DOTINNER:%.*]] +; CHECK: .exit: +; CHECK-NEXT: ret void +; CHECK: .outer: +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[EXITCOND_OUTER:%.*]] = icmp eq i64 [[I_NEXT]], 32 +; CHECK-NEXT: br i1 [[EXITCOND_OUTER]], label [[DOTEXIT:%.*]], label [[DOTOUTER_PREHEADER]] +; CHECK: .inner: +; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[DOTINNER]] ] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 0, i64 [[J]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP1]] +; CHECK-NEXT: [[TMP24:%.*]] = add nsw i32 [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.s* [[OBJ]], i64 0, i32 2, i64 [[I]], i64 [[J]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = add nsw i32 [[TMP24]], [[TMP26]] +; CHECK-NEXT: store i32 [[TMP27]], i32* [[TMP25]] +; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i64 [[J]], 1 +; CHECK-NEXT: [[EXITCOND_INNER:%.*]] = icmp eq i64 [[J_NEXT]], [[Z]] +; CHECK-NEXT: br i1 [[EXITCOND_INNER]], label [[DOTOUTER]], label [[DOTINNER]], !llvm.loop !10 +; br label %.outer.preheader @@ -42,7 +122,7 @@ .exit: ret void - + .outer: %i.next = add nuw nsw i64 %i, 1 %exitcond.outer = icmp eq i64 %i.next, 32 @@ -57,7 +137,7 @@ %6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j %7 = load i32, i32* %6 %8 = add nsw i32 %5, %7 - store i32 %8, i32* %6 + store i32 %8, i32* %6 %j.next = add nuw nsw i64 %j, 1 %exitcond.inner = icmp eq i64 %j.next, %z br i1 %exitcond.inner, label %.outer, label %.inner Index: llvm/test/Transforms/LoopVectorize/pr35773.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr35773.ll +++ llvm/test/Transforms/LoopVectorize/pr35773.ll @@ -31,9 +31,6 @@ ; CHECK-NEXT: [[I32_IV:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ , [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[MAIN_IV]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0 ; CHECK-NEXT: [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]] Index: llvm/test/Transforms/LoopVectorize/pr44488-predication.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr44488-predication.ll +++ llvm/test/Transforms/LoopVectorize/pr44488-predication.ll @@ -13,9 +13,6 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE2:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 99, [[TMP0]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* @v_38, align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* @v_38, align 1 Index: llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll +++ llvm/test/Transforms/LoopVectorize/vector-intrinsic-call-cost.ll @@ -3,9 +3,6 @@ ; CHECK-LABEL: @test_fshl ; CHECK-LABEL: vector.body: ; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: %induction = add <4 x i32> %broadcast.splat, ; CHECK-NEXT: %0 = add i32 %index, 0 ; CHECK-NEXT: %1 = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> undef, <4 x i16> undef, <4 x i16> ) ; CHECK-NEXT: %index.next = add i32 %index, 4