Index: llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -49,6 +49,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -111,6 +112,7 @@
 class Vectorizer {
   Function &F;
   AliasAnalysis &AA;
+  AssumptionCache &AC;
   DominatorTree &DT;
   ScalarEvolution &SE;
   TargetTransformInfo &TTI;
@@ -118,9 +120,9 @@
   IRBuilder<> Builder;
 
 public:
-  Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
-             ScalarEvolution &SE, TargetTransformInfo &TTI)
-      : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
+  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
+             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
+      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
         DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
 
   bool run();
@@ -205,6 +207,7 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<ScalarEvolutionWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
@@ -219,6 +222,7 @@
 INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                       "Vectorize load and Store instructions", false, false)
 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
@@ -241,7 +245,10 @@
   TargetTransformInfo &TTI =
       getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 
-  Vectorizer V(F, AA, DT, SE, TTI);
+  AssumptionCache &AC =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+  Vectorizer V(F, AA, AC, DT, SE, TTI);
   return V.run();
 }
 
@@ -254,8 +261,9 @@
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
   ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
   TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
 
-  Vectorizer V(F, AA, DT, SE, TTI);
+  Vectorizer V(F, AA, AC, DT, SE, TTI);
   bool Changed = V.run();
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
@@ -510,7 +518,7 @@
     if (!OpA)
       return false;
     KnownBits Known(BitWidth);
-    computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
+    computeKnownBits(OpA, Known, DL, 0, &AC, OpA, &DT);
     APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
     if (Signed)
       BitsAllowedToBeSet.clearBit(BitWidth - 1);
Index: llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
===================================================================
--- llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
+++ llvm/test/Transforms/LoadStoreVectorizer/X86/vectorize-i8-nested-add.ll
@@ -104,8 +104,180 @@
   ret void
 }
 
+declare void @llvm.assume(i1)
+
+define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
+; CHECK-LABEL: @ld_v4i8_add_known_bits(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
+; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
+; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[V0]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <3 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, <3 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %v0 = mul i32 %ind0, 4
+  %v1 = mul i32 %ind1, 4
+  %tmp = add i32 %v0, -1
+  %tmp1 = add i32 %v1, %tmp
+  %tmp2 = sext i32 %tmp1 to i64
+  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
+  %tmp4 = load i8, i8* %tmp3, align 1
+  %tmp5 = add i32 %v1, %v0
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
+  %tmp8 = load i8, i8* %tmp7, align 1
+  %tmp9 = add i32 %v0, 1
+  %tmp10 = add i32 %v1, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
+  %tmp13 = load i8, i8* %tmp12, align 1
+  %tmp14 = add i32 %v0, 2
+  %tmp15 = add i32 %v1, %tmp14
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
+  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
+  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
+  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
+  store <4 x i8> %tmp22, <4 x i8>* %dst
+  ret void
+}
+
+define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
+; CHECK-LABEL: @ld_v4i8_add_known_bits1(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
+; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %v0 = mul i32 %ind0, 4
+  %v1 = mul i32 %ind1, 4
+  %tmp = add i32 %v0, 3
+  %tmp1 = add i32 %v1, %tmp
+  %tmp2 = sext i32 %tmp1 to i64
+  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
+  %tmp4 = load i8, i8* %tmp3, align 1
+  %tmp5 = add i32 %v1, %v0
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
+  %tmp8 = load i8, i8* %tmp7, align 1
+  %tmp9 = add i32 %v0, 1
+  %tmp10 = add i32 %v1, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
+  %tmp13 = load i8, i8* %tmp12, align 1
+  %tmp14 = add i32 %v0, 2
+  %tmp15 = add i32 %v1, %tmp14
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
+  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
+  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
+  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
+  store <4 x i8> %tmp22, <4 x i8>* %dst
+  ret void
+}
+
+define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, i8* %src, <4 x i8>* %dst) {
+; CHECK-LABEL: @ld_v4i8_add_known_bits_by_assume(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 3
+; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0]], 3
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
+; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1]], 3
+; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
+; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
+; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
+; CHECK-NEXT:    ret void
+;
+bb:
+  %v0 = mul i32 %ind0, 3
+  %v1 = mul i32 %ind1, 3
+  %and.i = and i32 %v0, 3
+  %cmp.i = icmp eq i32 %and.i, 0
+  %and.i.1 = and i32 %v1, 3
+  %cmp.i.1 = icmp eq i32 %and.i.1, 0
+  call void @llvm.assume(i1 %cmp.i)
+  call void @llvm.assume(i1 %cmp.i.1)
+  %tmp = add i32 %v0, 3
+  %tmp1 = add i32 %v1, %tmp
+  %tmp2 = sext i32 %tmp1 to i64
+  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
+  %tmp4 = load i8, i8* %tmp3, align 1
+  %tmp5 = add i32 %v1, %v0
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
+  %tmp8 = load i8, i8* %tmp7, align 1
+  %tmp9 = add i32 %v0, 1
+  %tmp10 = add i32 %v1, %tmp9
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
+  %tmp13 = load i8, i8* %tmp12, align 1
+  %tmp14 = add i32 %v0, 2
+  %tmp15 = add i32 %v1, %tmp14
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
+  %tmp18 = load i8, i8* %tmp17, align 1
+  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
+  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
+  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
+  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
+  store <4 x i8> %tmp22, <4 x i8>* %dst
+  ret void
+}
+
 ; Make sure we don't vectorize the loads below because the source of
-; sext instructions doesn't have the nsw flag.
+; sext instructions doesn't have the nsw flag or known bits that allow
+; the vectorization to be applied.
 define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
 ; CHECK-LABEL: @ld_v4i8_add_not_safe(