Index: llvm/lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -138,6 +138,10 @@ EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); +static cl::opt MaxSinkNumUsers( + "instcombine-max-sink-users", cl::init(32), + cl::desc("Maximum number of undroppable users for instruction sinking")); + static cl::opt LimitMaxIterations( "instcombine-max-iterations", cl::desc("Limit the maximum number of instruction combining iterations"), @@ -3859,7 +3863,6 @@ /// block. static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock, TargetLibraryInfo &TLI) { - assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!"); BasicBlock *SrcBlock = I->getParent(); // Cannot move control-flow-involving, volatile loads, vaarg, etc. @@ -4026,48 +4029,68 @@ [this](Instruction *I) -> Optional { if (!EnableCodeSinking) return None; - auto *UserInst = cast_or_null(I->getUniqueUndroppableUser()); - if (!UserInst) - return None; BasicBlock *BB = I->getParent(); BasicBlock *UserParent = nullptr; + unsigned NumUsers = 0; - // Special handling for Phi nodes - get the block the use occurs in. - if (PHINode *PN = dyn_cast(UserInst)) { - for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { - if (PN->getIncomingValue(i) == I) { - // Bail out if we have uses in different blocks. We don't do any - // sophisticated analysis (i.e finding NearestCommonDominator of these - // use blocks). - if (UserParent && UserParent != PN->getIncomingBlock(i)) - return None; - UserParent = PN->getIncomingBlock(i); + for (auto *U : I->users()) { + if (U->isDroppable()) + continue; + if (NumUsers > MaxSinkNumUsers) + return None; + + Instruction *UserInst = cast(U); + // Special handling for Phi nodes - get the block the use occurs in. + if (PHINode *PN = dyn_cast(UserInst)) { + for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) { + if (PN->getIncomingValue(i) == I) { + // Bail out if we have uses in different blocks. We don't do any + // sophisticated analysis (i.e finding NearestCommonDominator of + // these use blocks). + if (UserParent && UserParent != PN->getIncomingBlock(i)) + return None; + UserParent = PN->getIncomingBlock(i); + } } + assert(UserParent && "expected to find user block!"); + } else { + if (UserParent && UserParent != UserInst->getParent()) + return None; + UserParent = UserInst->getParent(); } - assert(UserParent && "expected to find user block!"); - } else - UserParent = UserInst->getParent(); - // Try sinking to another block. If that block is unreachable, then do - // not bother. SimplifyCFG should handle it. - if (UserParent == BB || !DT.isReachableFromEntry(UserParent)) - return None; + // Make sure these checks are done only once, naturally we do the checks + // the first time we get the userparent, this will save compile time. + if (NumUsers == 0) { + // Try sinking to another block. If that block is unreachable, then do + // not bother. SimplifyCFG should handle it. + if (UserParent == BB || !DT.isReachableFromEntry(UserParent)) + return None; + + auto *Term = UserParent->getTerminator(); + // See if the user is one of our successors that has only one + // predecessor, so that we don't have to split the critical edge. + // Another option where we can sink is a block that ends with a + // terminator that does not pass control to other block (such as + // return or unreachable or resume). In this case: + // - I dominates the User (by SSA form); + // - the User will be executed at most once. + // So sinking I down to User is always profitable or neutral. + if (UserParent->getUniquePredecessor() != BB && !succ_empty(Term)) + return None; + + assert(DT.dominates(BB, UserParent) && "Dominance relation broken?"); + } - auto *Term = UserParent->getTerminator(); - // See if the user is one of our successors that has only one - // predecessor, so that we don't have to split the critical edge. - // Another option where we can sink is a block that ends with a - // terminator that does not pass control to other block (such as - // return or unreachable or resume). In this case: - // - I dominates the User (by SSA form); - // - the User will be executed at most once. - // So sinking I down to User is always profitable or neutral. - if (UserParent->getUniquePredecessor() == BB || succ_empty(Term)) { - assert(DT.dominates(BB, UserParent) && "Dominance relation broken?"); - return UserParent; + NumUsers++; } - return None; + + // No user or only has droppable users. + if (!UserParent) + return None; + + return UserParent; }; auto OptBB = getOptionalSinkBlockForInst(I); Index: llvm/test/Transforms/InstCombine/intptr7.ll =================================================================== --- llvm/test/Transforms/InstCombine/intptr7.ll +++ llvm/test/Transforms/InstCombine/intptr7.ll @@ -4,17 +4,17 @@ define void @matching_phi(i64 %a, float* %b, i1 %cond) { ; CHECK-LABEL: @matching_phi( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 2 +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: ; CHECK-NEXT: [[ADD_INT:%.*]] = add i64 [[A:%.*]], 1 ; CHECK-NEXT: [[ADD:%.*]] = inttoptr i64 [[ADD_INT]] to float* -; CHECK-NEXT: br i1 [[COND:%.*]], label [[BBB:%.*]], label [[A:%.*]] -; CHECK: A: -; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 2 -; CHECK-NEXT: br label [[C:%.*]] -; CHECK: Bbb: ; CHECK-NEXT: store float 1.000000e+01, float* [[ADD]], align 4 -; CHECK-NEXT: br label [[C]] -; CHECK: C: -; CHECK-NEXT: [[A_ADDR_03:%.*]] = phi float* [ [[ADDB]], [[A]] ], [ [[ADD]], [[BBB]] ] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: [[A_ADDR_03:%.*]] = phi float* [ [[ADDB]], [[BB1]] ], [ [[ADD]], [[BB2]] ] ; CHECK-NEXT: [[I1:%.*]] = load float, float* [[A_ADDR_03]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], float* [[A_ADDR_03]], align 4 @@ -27,16 +27,16 @@ %addb = getelementptr inbounds float, float* %b, i64 2 %addb.int = ptrtoint float* %addb to i64 - br i1 %cmp1, label %A, label %Bbb -A: - br label %C -Bbb: + br i1 %cmp1, label %bb1, label %bb2 +bb1: + br label %bb3 +bb2: store float 1.0e+01, float* %add, align 4 - br label %C + br label %bb3 -C: - %a.addr.03 = phi float* [ %addb, %A ], [ %add, %Bbb ] - %b.addr.02 = phi i64 [ %addb.int, %A ], [ %add.int, %Bbb ] +bb3: + %a.addr.03 = phi float* [ %addb, %bb1 ], [ %add, %bb2 ] + %b.addr.02 = phi i64 [ %addb.int, %bb1 ], [ %add.int, %bb2 ] %i0 = inttoptr i64 %b.addr.02 to float* %i1 = load float, float* %i0, align 4 %mul.i = fmul float %i1, 4.200000e+01 @@ -48,18 +48,20 @@ ; CHECK-LABEL: @no_matching_phi( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD_INT:%.*]] = add i64 [[A:%.*]], 1 -; CHECK-NEXT: [[ADD:%.*]] = inttoptr i64 [[ADD_INT]] to float* ; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 2 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[B:%.*]], label [[A:%.*]] ; CHECK: A: ; CHECK-NEXT: br label [[C:%.*]] ; CHECK: B: +; CHECK-NEXT: [[ADDB_INT:%.*]] = ptrtoint float* [[ADDB]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = inttoptr i64 [[ADD_INT]] to float* ; CHECK-NEXT: store float 1.000000e+01, float* [[ADD]], align 4 ; CHECK-NEXT: br label [[C]] ; CHECK: C: ; CHECK-NEXT: [[A_ADDR_03:%.*]] = phi float* [ [[ADDB]], [[A]] ], [ [[ADD]], [[B]] ] -; CHECK-NEXT: [[B_ADDR_02_PTR:%.*]] = phi float* [ [[ADD]], [[A]] ], [ [[ADDB]], [[B]] ] -; CHECK-NEXT: [[I1:%.*]] = load float, float* [[B_ADDR_02_PTR]], align 4 +; CHECK-NEXT: [[B_ADDR_02:%.*]] = phi i64 [ [[ADD_INT]], [[A]] ], [ [[ADDB_INT]], [[B]] ] +; CHECK-NEXT: [[I0:%.*]] = inttoptr i64 [[B_ADDR_02]] to float* +; CHECK-NEXT: [[I1:%.*]] = load float, float* [[I0]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], float* [[A_ADDR_03]], align 4 ; CHECK-NEXT: ret void Index: llvm/test/Transforms/InstCombine/lifetime-no-null-opt.ll =================================================================== --- llvm/test/Transforms/InstCombine/lifetime-no-null-opt.ll +++ llvm/test/Transforms/InstCombine/lifetime-no-null-opt.ll @@ -11,17 +11,17 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEXT:%.*]] = alloca [1 x i8], align 1 ; CHECK-NEXT: [[BUFF:%.*]] = alloca [1 x i8], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0 ; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: -; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], [[META16:metadata !.*]], metadata !DIExpression()), [[DBG24:!dbg !.*]] +; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]] ; CHECK-NEXT: br label [[FIN:%.*]] ; CHECK: else: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0 ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[TMP0]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[TMP1]]) ; CHECK-NEXT: call void @foo(i8* [[TMP1]], i8* [[TMP0]]) Index: llvm/test/Transforms/InstCombine/lifetime.ll =================================================================== --- llvm/test/Transforms/InstCombine/lifetime.ll +++ llvm/test/Transforms/InstCombine/lifetime.ll @@ -11,17 +11,17 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TEXT:%.*]] = alloca [1 x i8], align 1 ; CHECK-NEXT: [[BUFF:%.*]] = alloca [1 x i8], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0 ; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: -; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], [[META16:metadata !.*]], metadata !DIExpression()), [[DBG24:!dbg !.*]] +; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]] ; CHECK-NEXT: br label [[FIN:%.*]] ; CHECK: else: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0 ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull [[TMP0]]) ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull [[TMP1]]) ; CHECK-NEXT: call void @foo(i8* nonnull [[TMP1]], i8* nonnull [[TMP0]]) Index: llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll =================================================================== --- llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll +++ llvm/test/Transforms/InstCombine/merging-multiple-stores-into-successor.ll @@ -15,7 +15,6 @@ ; CHECK-NEXT: [[I:%.*]] = load i8, i8* @var_7, align 1 ; CHECK-NEXT: [[I1:%.*]] = icmp eq i8 [[I]], -1 ; CHECK-NEXT: [[I4:%.*]] = load i16, i16* @var_0, align 2 -; CHECK-NEXT: [[I8:%.*]] = sext i16 [[I4]] to i32 ; CHECK-NEXT: br i1 [[I1]], label [[BB10:%.*]], label [[BB9:%.*]] ; CHECK: bb9: ; CHECK-NEXT: br label [[BB12:%.*]] @@ -31,6 +30,7 @@ ; CHECK-NEXT: [[STOREMERGE1:%.*]] = phi i32 [ [[I11]], [[BB10]] ], [ 1, [[BB9]] ] ; CHECK-NEXT: store i32 [[STOREMERGE1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @arr_2, i64 0, i64 0), align 4 ; CHECK-NEXT: store i16 [[I4]], i16* getelementptr inbounds ([0 x i16], [0 x i16]* @arr_4, i64 0, i64 0), align 2 +; CHECK-NEXT: [[I8:%.*]] = sext i16 [[I4]] to i32 ; CHECK-NEXT: store i32 [[I8]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @arr_3, i64 0, i64 0), align 16 ; CHECK-NEXT: store i32 [[STOREMERGE1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @arr_2, i64 0, i64 1), align 4 ; CHECK-NEXT: store i16 [[I4]], i16* getelementptr inbounds ([0 x i16], [0 x i16]* @arr_4, i64 0, i64 1), align 2 Index: llvm/test/Transforms/InstCombine/pr33689_same_bitwidth.ll =================================================================== --- llvm/test/Transforms/InstCombine/pr33689_same_bitwidth.ll +++ llvm/test/Transforms/InstCombine/pr33689_same_bitwidth.ll @@ -14,11 +14,11 @@ ; CHECK-LABEL: @f( ; CHECK-NEXT: bb0: ; CHECK-NEXT: [[T12:%.*]] = alloca [2 x i32], align 8 -; CHECK-NEXT: [[T12_SUB:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[T12]], i16 0, i16 0 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: unreachable ; CHECK: bb2: +; CHECK-NEXT: [[T12_SUB:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[T12]], i16 0, i16 0 ; CHECK-NEXT: [[T9:%.*]] = load i16*, i16** @b, align 2 ; CHECK-NEXT: store i16 0, i16* [[T9]], align 2 ; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[T12_SUB]], align 8 Index: llvm/test/Transforms/InstCombine/shift-by-signext.ll =================================================================== --- llvm/test/Transforms/InstCombine/shift-by-signext.ll +++ llvm/test/Transforms/InstCombine/shift-by-signext.ll @@ -69,11 +69,11 @@ define i32 @t6_twoshifts(i32 %x, i8 %shamt) { ; CHECK-LABEL: @t6_twoshifts( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32 ; CHECK-NEXT: br label [[WORK:%.*]] ; CHECK: work: ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: end: +; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32 ; CHECK-NEXT: [[N0:%.*]] = shl i32 [[X:%.*]], [[SHAMT_WIDE]] ; CHECK-NEXT: [[R:%.*]] = ashr i32 [[N0]], [[SHAMT_WIDE]] ; CHECK-NEXT: ret i32 [[R]] @@ -151,11 +151,11 @@ } define i32 @n12_twoshifts_and_extrause(i32 %x, i8 %shamt) { ; CHECK-LABEL: @n12_twoshifts_and_extrause( -; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32 ; CHECK-NEXT: br label [[WORK:%.*]] ; CHECK: work: ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: end: +; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32 ; CHECK-NEXT: [[N0:%.*]] = shl i32 [[X:%.*]], [[SHAMT_WIDE]] ; CHECK-NEXT: [[R:%.*]] = ashr i32 [[N0]], [[SHAMT_WIDE]] ; CHECK-NEXT: call void @use32(i32 [[SHAMT_WIDE]]) Index: llvm/test/Transforms/InstCombine/sink_instruction.ll =================================================================== --- llvm/test/Transforms/InstCombine/sink_instruction.ll +++ llvm/test/Transforms/InstCombine/sink_instruction.ll @@ -41,7 +41,7 @@ ; CHECK: bb1: ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[X_ADDR_17]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], [[X_ADDR_17]] -; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @bar() #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @bar() #[[ATTR3:[0-9]+]] ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: ; CHECK-NEXT: [[X_ADDR_0]] = phi i32 [ [[TMP2]], [[BB1]] ], [ [[X_ADDR_17]], [[BB]] ] @@ -179,10 +179,10 @@ define i32 @test6(i32* nocapture readonly %P, i32 %i, i1 %cond) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADD:%.*]] = shl nsw i32 [[I]], 1 +; CHECK-NEXT: [[ADD:%.*]] = shl nsw i32 [[I:%.*]], 1 ; CHECK-NEXT: br label [[DISPATCHBB:%.*]] ; CHECK: dispatchBB: -; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I:%.*]] to i64 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: switch i32 [[I]], label [[SW_BB:%.*]] [ @@ -237,3 +237,34 @@ else: ret void } + +declare void @abort() +declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) +declare void @dummy(i64) +; Two uses in two different users of a single successor block. We can sink. +define i64 @test8(i64 %c) { +; CHECK-LABEL: @test8( +; CHECK-NEXT: bb1: +; CHECK-NEXT: [[OVERFLOW:%.*]] = icmp ugt i64 [[C:%.*]], 2305843009213693951 +; CHECK-NEXT: br i1 [[OVERFLOW]], label [[ABORT:%.*]], label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: call void @dummy(i64 8) +; CHECK-NEXT: ret i64 8 +; CHECK: abort: +; CHECK-NEXT: call void @abort() +; CHECK-NEXT: unreachable +; +bb1: + %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c, i64 8) + %overflow = extractvalue { i64, i1 } %mul, 1 + %select = select i1 %overflow, i64 0, i64 8 + br i1 %overflow, label %abort, label %bb2 + +bb2: + call void @dummy(i64 %select) + ret i64 %select + +abort: + call void @abort() + unreachable +} Index: llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll +++ llvm/test/Transforms/LoopUnroll/runtime-unroll-remainder.ll @@ -33,20 +33,20 @@ ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_EPIL]], align 4 ; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul nsw i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[ADD_EPIL:%.*]] = add nsw i32 [[MUL_EPIL]], [[C_010_UNR]] -; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 1 ; CHECK-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 1 ; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:%.*]], label [[FOR_BODY_EPIL_1:%.*]] ; CHECK: for.body.epil.1: +; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 1 ; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_EPIL]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_EPIL_1]], align 4 ; CHECK-NEXT: [[MUL_EPIL_1:%.*]] = mul nsw i32 [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[ADD_EPIL_1:%.*]] = add nsw i32 [[MUL_EPIL_1]], [[ADD_EPIL]] -; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 2 ; CHECK-NEXT: [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 2 ; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]], label [[FOR_BODY_EPIL_2:%.*]] ; CHECK: for.body.epil.2: +; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 2 ; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_EPIL_1]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_EPIL_1]] Index: llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -32,15 +32,15 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP1]], align 4, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP1]], <16 x i32> undef), !alias.scope !3 -; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -52,26 +52,26 @@ ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[SMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) ; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX11]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT18]], <8 x i32*> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT20]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT17]], <8 x i32*> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX14]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT19]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT21]], <8 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX14]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]] -; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[WIDE_MASKED_GATHER22:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT19]], i32 4, <8 x i1> [[TMP6]], <8 x i32> undef) -; CHECK-NEXT: [[PREDPHI23:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[WIDE_MASKED_GATHER22]], <8 x i32> -; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[SMAX11]], [[N_VEC13]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI23]], i64 7 -; CHECK-NEXT: br i1 [[CMP_N16]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT18]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER21:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef) +; CHECK-NEXT: [[PREDPHI22:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER21]], <8 x i32> +; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX11]], [[N_VEC13]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI22]], i64 7 +; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] Index: llvm/test/Transforms/LoopVectorize/X86/small-size.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -76,10 +76,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]] ; CHECK: .lr.ph5.preheader: -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0 @@ -136,24 +136,24 @@ ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[N]], 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]] ; CHECK: .lr.ph.preheader: -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH8:%.*]], label [[VECTOR_PH10:%.*]] ; CHECK: vector.ph10: +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64 ; CHECK-NEXT: [[N_RND_UP11:%.*]] = add nuw nsw i64 [[TMP19]], 4 ; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[N_RND_UP11]], 8589934588 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP19]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT18]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY9:%.*]] ; CHECK: vector.body9: -; CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT37:%.*]], [[PRED_STORE_CONTINUE36:%.*]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX38]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX38]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT27]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT28]], +; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT21]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT22]], ; CHECK-NEXT: [[TMP20:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT19]] ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP20]], i64 0 -; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] ; CHECK: pred.store.if23: ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4 @@ -162,10 +162,10 @@ ; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], [[TMP23]] ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP27]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] ; CHECK: pred.store.continue24: ; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i64 1 -; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]] +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] ; CHECK: pred.store.if25: ; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP29]] @@ -175,10 +175,10 @@ ; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], [[TMP31]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP29]] ; CHECK-NEXT: store i32 [[TMP34]], i32* [[TMP35]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] ; CHECK: pred.store.continue26: ; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP20]], i64 2 -; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]] +; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] ; CHECK: pred.store.if27: ; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP37]] @@ -188,10 +188,10 @@ ; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], [[TMP39]] ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP37]] ; CHECK-NEXT: store i32 [[TMP42]], i32* [[TMP43]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE34]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]] ; CHECK: pred.store.continue28: ; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP20]], i64 3 -; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36]] +; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.if29: ; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP45]] @@ -201,10 +201,10 @@ ; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP49]], [[TMP47]] ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP45]] ; CHECK-NEXT: store i32 [[TMP50]], i32* [[TMP51]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE36]] +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]] ; CHECK: pred.store.continue30: -; CHECK-NEXT: [[INDEX_NEXT37]] = add i64 [[INDEX38]], 4 -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT37]], [[N_VEC13]] +; CHECK-NEXT: [[INDEX_NEXT31]] = add i64 [[INDEX20]], 4 +; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC13]] ; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block7: ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]] @@ -266,62 +266,62 @@ ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]] ; CHECK: .lr.ph.preheader: -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE21:%.*]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT14]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT15]], +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE19:%.*]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT12]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT13]], ; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16 +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[Q:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[NEXT_GEP8]], align 16 ; CHECK-NEXT: store i32 [[TMP6]], i32* [[NEXT_GEP]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i64 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]] ; CHECK: pred.store.if14: ; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP8]] +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16 -; CHECK-NEXT: store i32 [[TMP10]], i32* [[NEXT_GEP7]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP9]], align 16 +; CHECK-NEXT: store i32 [[TMP10]], i32* [[NEXT_GEP5]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]] ; CHECK: pred.store.continue15: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i64 2 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] ; CHECK: pred.store.if16: ; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP12]] +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[NEXT_GEP12]], align 16 -; CHECK-NEXT: store i32 [[TMP14]], i32* [[NEXT_GEP8]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16 +; CHECK-NEXT: store i32 [[TMP14]], i32* [[NEXT_GEP6]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]] ; CHECK: pred.store.continue17: ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP4]], i64 3 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21]] +; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19]] ; CHECK: pred.store.if18: ; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP16]] +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[NEXT_GEP13]], align 16 -; CHECK-NEXT: store i32 [[TMP18]], i32* [[NEXT_GEP9]], align 16 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]] +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16 +; CHECK-NEXT: store i32 [[TMP18]], i32* [[NEXT_GEP7]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]] ; CHECK: pred.store.continue19: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -455,8 +455,8 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], +; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: Index: llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -133,8 +133,8 @@ ; DISABLED_MASKED_STRIDED-LABEL: @test2( ; DISABLED_MASKED_STRIDED-NEXT: entry: ; DISABLED_MASKED_STRIDED-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[NUMPOINTS:%.*]], 0 -; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; DISABLED_MASKED_STRIDED: for.body.preheader: +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; DISABLED_MASKED_STRIDED: vector.ph: ; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NUMPOINTS]] to i64 ; DISABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[WIDE_TRIP_COUNT]], 3 ; DISABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 @@ -143,8 +143,8 @@ ; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; DISABLED_MASKED_STRIDED: vector.body: -; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ] -; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[FOR_BODY_PREHEADER]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ] +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] ; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>* @@ -238,8 +238,8 @@ ; ENABLED_MASKED_STRIDED-LABEL: @test2( ; ENABLED_MASKED_STRIDED-NEXT: entry: ; ENABLED_MASKED_STRIDED-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[NUMPOINTS:%.*]], 0 -; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] -; ENABLED_MASKED_STRIDED: for.body.preheader: +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; ENABLED_MASKED_STRIDED: vector.ph: ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NUMPOINTS]] to i64 ; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[WIDE_TRIP_COUNT]], 3 ; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 @@ -249,11 +249,11 @@ ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 -1 ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT2]], -; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT2]], +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>* ; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP3]], i32 2, <4 x i1> [[TMP1]], <4 x i16> poison) Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -4367,11 +4367,11 @@ ; ; CHECK-LABEL: @sink_into_replication_region( ; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1 @@ -4441,11 +4441,11 @@ ; ; UNROLL-LABEL: @sink_into_replication_region( ; UNROLL-NEXT: bb: +; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL: vector.ph: ; UNROLL-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7 ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; UNROLL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1 @@ -4864,11 +4864,11 @@ ; ; CHECK-LABEL: @sink_into_replication_region_multiple( ; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3 ; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1 @@ -4972,11 +4972,11 @@ ; ; UNROLL-LABEL: @sink_into_replication_region_multiple( ; UNROLL-NEXT: bb: +; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL: vector.ph: ; UNROLL-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7 ; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; UNROLL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1 Index: llvm/test/Transforms/LoopVectorize/induction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/induction.ll +++ llvm/test/Transforms/LoopVectorize/induction.ll @@ -1194,11 +1194,11 @@ ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 8) ; INTERLEAVE-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 -; INTERLEAVE-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64 ; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: +; INTERLEAVE-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 7 ; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; INTERLEAVE-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 8, i64 [[N_MOD_VF]] Index: llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -115,11 +115,11 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>* ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC5]], +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC6]], ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>* @@ -496,11 +496,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2) ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] @@ -759,12 +759,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>* ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[VEC_IND]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>* ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> Index: llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll =================================================================== --- llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll +++ llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll @@ -299,22 +299,22 @@ define void @multiply_dont_hoist_phi(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) { ; CHECK-LABEL: @multiply_dont_hoist_phi( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>* -; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2 +; CHECK-NEXT: br label [[NEXT:%.*]] +; CHECK: next: +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A:%.*]], i64 0, i64 2 ; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8 -; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>* -; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8 -; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B:%.*]], i64 0, i64 2 ; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>* ; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8 -; CHECK-NEXT: br label [[NEXT:%.*]] -; CHECK: next: ; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> +; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]] ; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP0]]) +; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] Index: llvm/test/Transforms/PGOProfile/chr.ll =================================================================== --- llvm/test/Transforms/PGOProfile/chr.ll +++ llvm/test/Transforms/PGOProfile/chr.ll @@ -1381,10 +1381,6 @@ ; CHECK-NEXT: br label [[BB1]] ; CHECK: bb1: ; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4 -; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2 -; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V6]], [[J0]] -; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43 -; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof [[PROF16]] ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4 ; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0 @@ -1393,6 +1389,10 @@ ; CHECK-NEXT: call void @foo() ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: +; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2 +; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V6]], [[J0]] +; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43 +; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof [[PROF16]] ; CHECK-NEXT: [[V5:%.*]] = icmp eq i32 [[I0]], [[SUM2]] ; CHECK-NEXT: [[SUM3:%.*]] = select i1 [[V5]], i32 [[SUM2]], i32 [[V8]], !prof [[PROF16]] ; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[SUM3]]