diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10052,23 +10052,48 @@
 bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred,
                                                     const SCEV *LHS,
                                                     const SCEV *RHS) {
-  // Match Result to (X + Y) where Y is a constant integer.
-  // Return Y via OutY.
-  auto MatchBinaryAddToConst =
-      [this](const SCEV *Result, const SCEV *X, APInt &OutY,
-             SCEV::NoWrapFlags ExpectedFlags) {
-    const SCEV *NonConstOp, *ConstOp;
-    SCEV::NoWrapFlags FlagsPresent;
-
-    if (!splitBinaryAdd(Result, ConstOp, NonConstOp, FlagsPresent) ||
-        !isa<SCEVConstant>(ConstOp) || NonConstOp != X)
+  // Match X to (A + C1) and Y to (A + C2), where C1 and C2 are constant
+  // integers. If X or Y is not an add expression, treat it as X + 0 or
+  // Y + 0 respectively. C1 and C2 are returned via OutC1 and OutC2.
+  auto MatchBinaryAddToConst = [this](const SCEV *X, const SCEV *Y,
+                                      APInt &OutC1, APInt &OutC2,
+                                      SCEV::NoWrapFlags ExpectedFlags) {
+    const SCEV *XNonConstOp, *XConstOp;
+    const SCEV *YNonConstOp, *YConstOp;
+    SCEV::NoWrapFlags XFlagsPresent;
+    SCEV::NoWrapFlags YFlagsPresent;
+
+    if (!splitBinaryAdd(X, XConstOp, XNonConstOp, XFlagsPresent)) {
+      XConstOp = getZero(X->getType());
+      XNonConstOp = X;
+      XFlagsPresent = ExpectedFlags;
+    }
+    if (!isa<SCEVConstant>(XConstOp) ||
+        (XFlagsPresent & ExpectedFlags) != ExpectedFlags)
       return false;
 
-    OutY = cast<SCEVConstant>(ConstOp)->getAPInt();
-    return (FlagsPresent & ExpectedFlags) == ExpectedFlags;
+    if (!splitBinaryAdd(Y, YConstOp, YNonConstOp, YFlagsPresent)) {
+      YConstOp = getZero(Y->getType());
+      YNonConstOp = Y;
+      YFlagsPresent = ExpectedFlags;
+    }
+
+    if (!isa<SCEVConstant>(YConstOp) ||
+        (YFlagsPresent & ExpectedFlags) != ExpectedFlags)
+      return false;
+
+    if (YNonConstOp != XNonConstOp)
+      return false;
+
+    OutC1 = cast<SCEVConstant>(XConstOp)->getAPInt();
+    OutC2 = cast<SCEVConstant>(YConstOp)->getAPInt();
+
+    return true;
   };
 
-  APInt C;
+  APInt C1;
+  APInt C2;
 
   switch (Pred) {
   default:
@@ -10078,45 +10103,38 @@
     std::swap(LHS, RHS);
     LLVM_FALLTHROUGH;
   case ICmpInst::ICMP_SLE:
-    // X s<= (X + C)<nsw> if C >= 0
-    if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative())
+    // (X + C1)<nsw> s<= (X + C2)<nsw> if C1 s<= C2.
+    if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNSW) && C1.sle(C2))
       return true;
 
-    // (X + C)<nsw> s<= X if C <= 0
-    if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) &&
-        !C.isStrictlyPositive())
-      return true;
     break;
 
   case ICmpInst::ICMP_SGT:
     std::swap(LHS, RHS);
     LLVM_FALLTHROUGH;
   case ICmpInst::ICMP_SLT:
-    // X s< (X + C)<nsw> if C > 0
-    if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) &&
-        C.isStrictlyPositive())
+    // (X + C1)<nsw> s< (X + C2)<nsw> if C1 s< C2.
+    if (MatchBinaryAddToConst(LHS, RHS, C1, C2, SCEV::FlagNSW) && C1.slt(C2))
       return true;
 
-    // (X + C)<nsw> s< X if C < 0
-    if (MatchBinaryAddToConst(LHS, RHS, C, SCEV::FlagNSW) && C.isNegative())
-      return true;
     break;
 
   case ICmpInst::ICMP_UGE:
     std::swap(LHS, RHS);
     LLVM_FALLTHROUGH;
   case ICmpInst::ICMP_ULE:
-    // X u<= (X + C)<nuw> for any C
-    if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNUW))
+    // (X + C1)<nuw> u<= (X + C2)<nuw> for C1 u<= C2.
+    if (MatchBinaryAddToConst(RHS, LHS, C2, C1, SCEV::FlagNUW) && C1.ule(C2))
       return true;
+    break;
 
   case ICmpInst::ICMP_UGT:
     std::swap(LHS, RHS);
     LLVM_FALLTHROUGH;
   case ICmpInst::ICMP_ULT:
-    // X u< (X + C)<nuw> if C != 0
-    if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNUW) && !C.isNullValue())
+    // (X + C1)<nuw> u< (X + C2)<nuw> if C1 u< C2.
+    if (MatchBinaryAddToConst(RHS, LHS, C2, C1, SCEV::FlagNUW) && C1.ult(C2))
       return true;
     break;
   }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -40,13 +40,30 @@
 ; CHECK-NEXT:  entry.slpmemcheck:
 ; CHECK-NEXT:    [[DST16:%.*]] = bitcast i32* [[DST:%.*]] to i8*
 ; CHECK-NEXT:    [[SRC18:%.*]] = bitcast i32* [[SRC:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    [[SCEVGEP17:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
 ; CHECK-NEXT:    [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[SRC]], i64 3
 ; CHECK-NEXT:    [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8*
 ; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[DST16]], [[SCEVGEP1920]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[SRC18]], [[SCEVGEP17]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED:%.*]]
+; CHECK:       entry.slpversioned:
+; CHECK-NEXT:    [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1
+; CHECK-NEXT:    [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1
+; CHECK-NEXT:    [[SRC_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT:    [[DST_GEP_2:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT:    [[SRC_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[DST_GEP_3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    br label [[ENTRY_MERGE:%.*]]
+; CHECK:       entry.merge:
+; CHECK-NEXT:    ret void
+; CHECK:       entry.scalar:
 ; CHECK-NEXT:    [[SRC_02:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[R_03:%.*]] = ashr i32 [[SRC_02]], 16
 ; CHECK-NEXT:    store i32 [[R_03]], i32* [[DST]], align 4
@@ -65,7 +82,7 @@
 ; CHECK-NEXT:    [[R_314:%.*]] = ashr i32 [[SRC_313]], 16
 ; CHECK-NEXT:    [[DST_GEP_315:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    store i32 [[R_314]], i32* [[DST_GEP_315]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br label [[ENTRY_MERGE]]
 ;
 entry:
   %src.0 = load i32, i32* %src, align 4
@@ -120,7 +137,7 @@
 ; CHECK-NEXT:  entry.slpmemcheck:
 ; CHECK-NEXT:    [[OUT_BLOCK12:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8*
 ; CHECK-NEXT:    [[COUNTER14:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8*
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 3
 ; CHECK-NEXT:    [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
 ; CHECK-NEXT:    [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3
 ; CHECK-NEXT:    [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8*
@@ -129,52 +146,44 @@
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED:%.*]]
 ; CHECK: 
entry.slpversioned: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4, !alias.scope !0, !noalias !3 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4, !alias.scope !3, !noalias !0 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4 -; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]] -; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX_2]] to <2 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !5, !noalias !8 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2_2]] to <2 x i32>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_2]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP8]], <2 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !8, !noalias !5 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !5 ; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] ; CHECK: entry.merge: ; CHECK-NEXT: ret void ; CHECK: entry.scalar: -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[COUNTER]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 -; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 -; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP9]], [[TMP8]] ; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4 ; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: 
[[TMP14:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 -; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP11]], [[TMP10]] ; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4 ; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 -; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP13]], [[TMP12]] ; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4 ; CHECK-NEXT: br label [[ENTRY_MERGE]] ; @@ -351,10 +360,12 @@ ; CHECK-NEXT: bb.slpmemcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 4 ; CHECK-NEXT: [[SCEVGEP6:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[A]], i64 5 ; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP6]], [[SCEVGEP78]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP78]], [[SCEVGEP6]] +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast i32* [[SCEVGEP9]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP6]], [[SCEVGEP910]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP910]], [[SCEVGEP78]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 4 ; CHECK-NEXT: store i32 0, i32* [[TMP2]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -6,36 +6,55 @@ ; CHECK-NEXT: entry.slpmemcheck: ; CHECK-NEXT: [[OUT_BLOCK12:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8* ; CHECK-NEXT: [[COUNTER14:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8* -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 3 ; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 1 +; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3 ; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8* ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[OUT_BLOCK12]], [[SCEVGEP1516]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[COUNTER14]], [[SCEVGEP13]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* 
[[OUT_BLOCK]], align 4 -; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[ENTRY_SCALAR:%.*]], label [[ENTRY_SLPVERSIONED:%.*]] +; CHECK: entry.slpversioned: +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 +; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 +; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[COUNTER]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[OUT_BLOCK]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !alias.scope !3, !noalias !0 +; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]] +; CHECK: entry.merge: +; CHECK-NEXT: ret void +; CHECK: entry.scalar: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[COUNTER]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4 +; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP7]], [[TMP6]] ; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 4 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 -; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4 +; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP9]], [[TMP8]] ; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4 ; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 -; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4 +; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP11]], [[TMP10]] ; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4 ; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4 ; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 -; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4 +; CHECK-NEXT: [[XOR_311:%.*]] 
= xor i32 [[TMP13]], [[TMP12]] ; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4 -; CHECK-NEXT: ret void +; CHECK-NEXT: br label [[ENTRY_MERGE]] ; entry: %0 = load i32, i32* %counter, align 4 @@ -79,12 +98,14 @@ ; CHECK-NEXT: call void @use(<8 x float> [[I71]]) ; CHECK-NEXT: ret void ; CHECK: then.slpmemcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 8 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 5 ; CHECK-NEXT: [[SCEVGEP8:%.*]] = bitcast float* [[SCEVGEP]] to i8* -; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr float, float* [[B]], i64 14 +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr float, float* [[A]], i64 8 ; CHECK-NEXT: [[SCEVGEP910:%.*]] = bitcast float* [[SCEVGEP9]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP8]], [[SCEVGEP910]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP910]], [[SCEVGEP8]] +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr float, float* [[B]], i64 14 +; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast float* [[SCEVGEP11]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP8]], [[SCEVGEP1112]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP1112]], [[SCEVGEP910]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: [[A_83:%.*]] = getelementptr inbounds float, float* [[A]], i64 8 ; CHECK-NEXT: store float 0.000000e+00, float* [[A_83]], align 4
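
Note on the ScalarEvolution change: it generalizes isKnownPredicateViaNoOverflow from matching X pred (X + C) to matching (A + C1) pred (A + C2), treating a side that is not an add expression as an offset of zero. A minimal sketch of the kind of comparison that becomes provable (hypothetical test, not part of this patch; it assumes ScalarEvolution transfers the nsw flags to the corresponding SCEV add expressions):

  ; Both operands share the base %x and differ only in their constant
  ; offset, so the generalized MatchBinaryAddToConst binds C1 = 1 and
  ; C2 = 3, and isKnownPredicateViaNoOverflow answers ICMP_SLT from
  ; 1 s< 3. Previously the match only fired when one side was exactly %x.
  define i1 @known_slt(i32 %x) {
    %a = add nsw i32 %x, 1
    %b = add nsw i32 %x, 3
    %c = icmp slt i32 %a, %b
    ret i1 %c
  }

The same reasoning with nuw is what the SLP test updates reflect: bounds derived from a common base pointer can now be ordered directly, so the expanded memory checks cover the full four-element access groups and the versioned entry.slpversioned fast paths with <4 x i32> operations become possible.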