Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6768,12 +6768,19 @@ // we might create due to scalarization. Cost += getScalarizationOverhead(I, VF); - // If we have a predicated store, it may not be executed for each vector - // lane. Scale the cost by the probability of executing the predicated - // block. + // If we have a predicated load/store, it will need extra i1 extracts and + // conditional branches, but may not be executed for each vector lane. Scale + // the cost by the probability of executing the predicated block. if (isPredicatedInst(I)) { Cost /= getReciprocalPredBlockProb(); + // Add the unscaled cost of one i1 extract per lane and of a branch. + auto *Vec_i1Ty = + VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF); + Cost += TTI.getScalarizationOverhead( + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); + Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); + if (useEmulatedMaskMemRefHack(I)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. 
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-predstorecost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-predstorecost.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-predstorecost.ll @@ -14,24 +14,6 @@ ; CHECK-NEXT: [[CMP66:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP66]], label [[FOR_BODY4_LR_PH_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body4.lr.ph.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[N]], [[TMP0]] -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[PG:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult float* [[PG]], [[SCEVGEP]] -; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP2]], float* [[PG]], float* [[SCEVGEP]] -; CHECK-NEXT: [[UMIN2:%.*]] = bitcast float* [[UMIN]] to i8* -; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt float* [[PG]], [[SCEVGEP]] -; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP3]], float* [[PG]], float* [[SCEVGEP]] -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr float, float* [[UMAX]], i32 1 -; CHECK-NEXT: [[SCEVGEP34:%.*]] = bitcast float* [[SCEVGEP3]] to i8* -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr float, float* [[PA:%.*]], i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult float* [[PA]], [[SCEVGEP5]] -; CHECK-NEXT: [[UMIN6:%.*]] = select i1 [[TMP4]], float* [[PA]], float* [[SCEVGEP5]] -; CHECK-NEXT: [[UMIN67:%.*]] = bitcast float* [[UMIN6]] to i8* -; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt float* [[PA]], [[SCEVGEP5]] -; CHECK-NEXT: [[UMAX8:%.*]] = select i1 [[TMP5]], float* [[PA]], float* [[SCEVGEP5]] -; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr float, float* [[UMAX8]], i32 1 -; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast float* [[SCEVGEP9]] to i8* ; CHECK-NEXT: br label [[FOR_BODY4_LR_PH:%.*]] ; CHECK: for.body4.lr.ph: ; CHECK-NEXT: [[I_067:%.*]] = phi i32 [ [[INC29:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ] @@ -41,93 +23,27 @@ ; 
CHECK: for.body4.us.preheader: ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.preheader: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[UMIN2]], [[SCEVGEP910]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[UMIN67]], [[SCEVGEP34]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true -; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE16]] ] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP6]], i32 [[N]]) -; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[PA]], <4 x i32> [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef), !alias.scope !0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; CHECK-NEXT: br i1 [[TMP9]], label 
[[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[WIDE_MASKED_GATHER]], i32 0 -; CHECK-NEXT: store float [[TMP12]], float* [[TMP11]], align 4, !alias.scope !3, !noalias !0 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]] -; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[WIDE_MASKED_GATHER]], i32 1 -; CHECK-NEXT: store float [[TMP16]], float* [[TMP15]], align 4, !alias.scope !3, !noalias !0 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]] -; CHECK: pred.store.if13: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[WIDE_MASKED_GATHER]], i32 2 -; CHECK-NEXT: store float [[TMP20]], float* [[TMP19]], align 4, !alias.scope !3, !noalias !0 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]] -; CHECK: pred.store.continue14: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]] -; CHECK: pred.store.if15: -; CHECK-NEXT: 
[[TMP22:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[WIDE_MASKED_GATHER]], i32 3 -; CHECK-NEXT: store float [[TMP24]], float* [[TMP23]], align 4, !alias.scope !3, !noalias !0 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] -; CHECK: pred.store.continue16: -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP3_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY4_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.body4.us: ; CHECK-NEXT: [[J_065_US:%.*]] = phi i32 [ [[INC26_US:%.*]], [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US:%.*]] ], [ [[I_067]], [[FOR_BODY4_US_PREHEADER]] ] ; CHECK-NEXT: [[MUL_US:%.*]] = mul nsw i32 [[J_065_US]], [[N]] ; CHECK-NEXT: [[ADD_US:%.*]] = add nsw i32 [[MUL_US]], [[I_067]] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, float* [[PA]], i32 [[ADD_US]] -; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX_US]], align 4 -; CHECK-NEXT: [[ARRAYIDX7_US:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[ADD_US]] -; CHECK-NEXT: store float [[TMP26]], float* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, float* [[PA:%.*]], i32 [[ADD_US]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX7_US:%.*]] = getelementptr inbounds float, float* [[PG:%.*]], i32 [[ADD_US]] +; CHECK-NEXT: store float [[TMP0]], float* 
[[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: br label [[FOR_BODY11_US:%.*]] ; CHECK: for.body11.us: -; CHECK-NEXT: [[TMP27:%.*]] = phi float [ [[TMP26]], [[FOR_BODY4_US]] ], [ [[SUB_US:%.*]], [[FOR_BODY11_US]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_BODY4_US]] ], [ [[SUB_US:%.*]], [[FOR_BODY11_US]] ] ; CHECK-NEXT: [[K_063_US:%.*]] = phi i32 [ 0, [[FOR_BODY4_US]] ], [ [[INC_US:%.*]], [[FOR_BODY11_US]] ] ; CHECK-NEXT: [[ADD16_US:%.*]] = add nsw i32 [[K_063_US]], [[MUL15]] ; CHECK-NEXT: [[ARRAYIDX17_US:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[ADD16_US]] -; CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[ARRAYIDX17_US]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX17_US]], align 4 ; CHECK-NEXT: [[ADD19_US:%.*]] = add nsw i32 [[K_063_US]], [[MUL_US]] ; CHECK-NEXT: [[ARRAYIDX20_US:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[ADD19_US]] -; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX20_US]], align 4 -; CHECK-NEXT: [[MUL21_US:%.*]] = fmul fast float [[TMP29]], [[TMP28]] -; CHECK-NEXT: [[SUB_US]] = fsub fast float [[TMP27]], [[MUL21_US]] +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX20_US]], align 4 +; CHECK-NEXT: [[MUL21_US:%.*]] = fmul fast float [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[SUB_US]] = fsub fast float [[TMP1]], [[MUL21_US]] ; CHECK-NEXT: store float [[SUB_US]], float* [[ARRAYIDX7_US]], align 4 ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[K_063_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_US]], [[I_067]] @@ -149,15 +65,15 @@ ; CHECK-NEXT: [[EXITCOND73_NOT:%.*]] = icmp eq i32 [[INC29]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND73_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY4_LR_PH]] ; CHECK: for.body4: -; CHECK-NEXT: [[J_065:%.*]] = phi i32 [ [[INC26:%.*]], [[FOR_BODY4]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[J_065:%.*]] = phi i32 [ [[INC26:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_BODY4_PREHEADER]] ] ; CHECK-NEXT: [[MUL:%.*]] = 
mul nsw i32 [[J_065]], [[N]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PA]], i32 [[MUL]] -; CHECK-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[PG]], i32 [[MUL]] -; CHECK-NEXT: store float [[TMP30]], float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: store float [[TMP4]], float* [[ARRAYIDX7]], align 4 ; CHECK-NEXT: [[INC26]] = add nuw nsw i32 [[J_065]], 1 ; CHECK-NEXT: [[EXITCOND72_NOT:%.*]] = icmp eq i32 [[INC26]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND72_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT]], label [[FOR_BODY4]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND72_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT:%.*]], label [[FOR_BODY4]] ; entry: %cmp66 = icmp sgt i32 %n, 0 Index: llvm/test/Transforms/LoopVectorize/X86/small-size.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -160,109 +160,100 @@ ; CHECK: pred.load.if: ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY9]] ], [ [[TMP27]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] +; CHECK-NEXT: [[TMP27:%.*]] = phi i32 [ poison, [[VECTOR_BODY9]] ], [ [[TMP26]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 +; CHECK-NEXT: br i1 [[TMP28]], label 
[[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] ; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP20]] -; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP29]], align 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] ; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP32]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP30]], [[PRED_LOAD_IF30]] ] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 +; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33:%.*]] ; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP21]] -; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP35]], align 4 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP21]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[TMP33]], align 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] ; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP37]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] +; CHECK-NEXT: 
[[TMP35:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE31]] ], [ [[TMP34]], [[PRED_LOAD_IF32]] ] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_LOAD_IF34:%.*]], label [[PRED_LOAD_CONTINUE35:%.*]] ; CHECK: pred.load.if34: -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP22]] -; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP40]], align 4 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP22]] +; CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP37]], align 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE35]] ; CHECK: pred.load.continue35: -; CHECK-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_LOAD_CONTINUE33]] ], [ [[TMP42]], [[PRED_LOAD_IF34]] ] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF36:%.*]], label [[PRED_LOAD_CONTINUE37:%.*]] +; CHECK-NEXT: [[TMP39:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE33]] ], [ [[TMP38]], [[PRED_LOAD_IF34]] ] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 +; CHECK-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF36:%.*]], label [[PRED_LOAD_CONTINUE37:%.*]] ; CHECK: pred.load.if36: -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i32> poison, i32 [[TMP46]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP41]], align 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE37]] ; CHECK: pred.load.continue37: -; CHECK-NEXT: [[TMP48:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE35]] ], [ 
[[TMP47]], [[PRED_LOAD_IF36]] ] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39:%.*]] +; CHECK-NEXT: [[TMP43:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE35]] ], [ [[TMP42]], [[PRED_LOAD_IF36]] ] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_LOAD_IF38:%.*]], label [[PRED_LOAD_CONTINUE39:%.*]] ; CHECK: pred.load.if38: -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP20]] -; CHECK-NEXT: [[TMP51:%.*]] = load i32, i32* [[TMP50]], align 4 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i32> [[TMP48]], i32 [[TMP51]], i32 1 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP45]], align 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE39]] ; CHECK: pred.load.continue39: -; CHECK-NEXT: [[TMP53:%.*]] = phi <4 x i32> [ [[TMP48]], [[PRED_LOAD_CONTINUE37]] ], [ [[TMP52]], [[PRED_LOAD_IF38]] ] -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP54]], label [[PRED_LOAD_IF40:%.*]], label [[PRED_LOAD_CONTINUE41:%.*]] +; CHECK-NEXT: [[TMP47:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE37]] ], [ [[TMP46]], [[PRED_LOAD_IF38]] ] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 +; CHECK-NEXT: br i1 [[TMP48]], label [[PRED_LOAD_IF40:%.*]], label [[PRED_LOAD_CONTINUE41:%.*]] ; CHECK: pred.load.if40: -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP21]] -; CHECK-NEXT: [[TMP56:%.*]] = load i32, i32* [[TMP55]], align 4 -; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x i32> [[TMP53]], i32 [[TMP56]], i32 2 +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP21]] +; CHECK-NEXT: 
[[TMP50:%.*]] = load i32, i32* [[TMP49]], align 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE41]] ; CHECK: pred.load.continue41: -; CHECK-NEXT: [[TMP58:%.*]] = phi <4 x i32> [ [[TMP53]], [[PRED_LOAD_CONTINUE39]] ], [ [[TMP57]], [[PRED_LOAD_IF40]] ] -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_LOAD_IF42:%.*]], label [[PRED_LOAD_CONTINUE43:%.*]] +; CHECK-NEXT: [[TMP51:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE39]] ], [ [[TMP50]], [[PRED_LOAD_IF40]] ] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 +; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF42:%.*]], label [[PRED_LOAD_CONTINUE43:%.*]] ; CHECK: pred.load.if42: -; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP22]] -; CHECK-NEXT: [[TMP61:%.*]] = load i32, i32* [[TMP60]], align 4 -; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x i32> [[TMP58]], i32 [[TMP61]], i32 3 +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[TMP22]] +; CHECK-NEXT: [[TMP54:%.*]] = load i32, i32* [[TMP53]], align 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE43]] ; CHECK: pred.load.continue43: -; CHECK-NEXT: [[TMP63:%.*]] = phi <4 x i32> [ [[TMP58]], [[PRED_LOAD_CONTINUE41]] ], [ [[TMP62]], [[PRED_LOAD_IF42]] ] -; CHECK-NEXT: [[TMP64:%.*]] = and <4 x i32> [[TMP63]], [[TMP43]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 -; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] +; CHECK-NEXT: [[TMP55:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE41]] ], [ [[TMP54]], [[PRED_LOAD_IF42]] ] +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <4 x i1> [[TMP23]], i32 0 +; CHECK-NEXT: br i1 [[TMP56]], label [[PRED_STORE_IF44:%.*]], label [[PRED_STORE_CONTINUE45:%.*]] ; CHECK: pred.store.if44: -; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 
[[OFFSET_IDX]] -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i32> [[TMP64]], i32 0 -; CHECK-NEXT: store i32 [[TMP67]], i32* [[TMP66]], align 4 +; CHECK-NEXT: [[TMP57:%.*]] = and i32 [[TMP43]], [[TMP27]] +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: store i32 [[TMP57]], i32* [[TMP58]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE45]] ; CHECK: pred.store.continue45: -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 -; CHECK-NEXT: br i1 [[TMP68]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP23]], i32 1 +; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_STORE_IF46:%.*]], label [[PRED_STORE_CONTINUE47:%.*]] ; CHECK: pred.store.if46: -; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP20]] -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <4 x i32> [[TMP64]], i32 1 -; CHECK-NEXT: store i32 [[TMP70]], i32* [[TMP69]], align 4 +; CHECK-NEXT: [[TMP60:%.*]] = and i32 [[TMP47]], [[TMP31]] +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP20]] +; CHECK-NEXT: store i32 [[TMP60]], i32* [[TMP61]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE47]] ; CHECK: pred.store.continue47: -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 -; CHECK-NEXT: br i1 [[TMP71]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i1> [[TMP23]], i32 2 +; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_STORE_IF48:%.*]], label [[PRED_STORE_CONTINUE49:%.*]] ; CHECK: pred.store.if48: -; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP21]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i32> [[TMP64]], i32 2 -; CHECK-NEXT: store i32 [[TMP73]], i32* [[TMP72]], align 4 +; CHECK-NEXT: 
[[TMP63:%.*]] = and i32 [[TMP51]], [[TMP35]] +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP21]] +; CHECK-NEXT: store i32 [[TMP63]], i32* [[TMP64]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE49]] ; CHECK: pred.store.continue49: -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51]] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i1> [[TMP23]], i32 3 +; CHECK-NEXT: br i1 [[TMP65]], label [[PRED_STORE_IF50:%.*]], label [[PRED_STORE_CONTINUE51]] ; CHECK: pred.store.if50: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP22]] -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <4 x i32> [[TMP64]], i32 3 -; CHECK-NEXT: store i32 [[TMP76]], i32* [[TMP75]], align 4 +; CHECK-NEXT: [[TMP66:%.*]] = and i32 [[TMP55]], [[TMP39]] +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP22]] +; CHECK-NEXT: store i32 [[TMP66]], i32* [[TMP67]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE51]] ; CHECK: pred.store.continue51: ; CHECK-NEXT: [[INDEX_NEXT15]] = add i64 [[INDEX14]], 4 -; CHECK-NEXT: [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]] -; CHECK-NEXT: br i1 [[TMP77]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[TMP68]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], [[LOOP5:!llvm.loop !.*]] ; CHECK: middle.block7: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]] ; CHECK: scalar.ph8: