Index: include/llvm/Analysis/ScalarEvolution.h
===================================================================
--- include/llvm/Analysis/ScalarEvolution.h
+++ include/llvm/Analysis/ScalarEvolution.h
@@ -523,6 +523,10 @@
   const SCEV *getConstant(Type *Ty, uint64_t V, bool isSigned = false);
   const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty);
   const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
+  /// Sink a zext towards the leaves of the SCEV more aggressively using
+  /// KnownBits, which requires the original IR Value to be available.
+  const SCEV *getZeroExtendExprForValue(const Value *OpV, const SCEV *Op,
+                                        Type *Ty, unsigned Depth = 0);
   const SCEV *getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
   const SCEV *getAnyExtendExpr(const SCEV *Op, Type *Ty);
   const SCEV *getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Index: lib/Analysis/ScalarEvolution.cpp
===================================================================
--- lib/Analysis/ScalarEvolution.cpp
+++ lib/Analysis/ScalarEvolution.cpp
@@ -1559,8 +1559,14 @@
   return false;
 }
 
-const SCEV *
-ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
+const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty,
+                                               unsigned Depth) {
+  return getZeroExtendExprForValue(nullptr, Op, Ty, Depth);
+}
+
+const SCEV *ScalarEvolution::getZeroExtendExprForValue(const Value *OpV,
+                                                       const SCEV *Op, Type *Ty,
+                                                       unsigned Depth) {
   assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
          "This is not an extending conversion!");
   assert(isSCEVable(Ty) &&
@@ -1777,6 +1783,46 @@
         Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1));
       return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1);
     }
+
+    // zext(C + x + y + ...) --> (zext(D) + zext((C - D) + x + y + ...))
+    // if D + (C - D + x + y + ...) can be proven not to unsigned wrap,
+    // where D is the largest such constant with D <= C (unsigned).
+    //
+    // Useful while proving that address arithmetic expressions are equal or
+    // differ by a small constant amount, see the LoadStoreVectorizer pass.
+    if (const auto *SC = dyn_cast<SCEVConstant>(SA->getOperand(0))) {
+      APInt MinValue = getUnsignedRangeMin(SA);
+      // Often address arithmetic contains expressions like
+      // (zext (add (shl X, C1), C2)), for instance, (zext (5 + (4 * X))).
+      // ConstantRange is unable to prove that it's possible to transform
+      // (5 + (4 * X)) to (1 + (4 + (4 * X))) w/o underflowing:
+      //
+      // | Expression    |  ConstantRange          |  KnownBits             |
+      // |---------------|-------------------------|------------------------|
+      // | i8 4 * X      |  [L: 0, U: 253)         |  XXXX XX00             |
+      // |               |    => Min: 0, Max: 252  |    => Min: 0, Max: 252 |
+      // |               |                         |                        |
+      // | i8 4 * X + 5  |  [L: 5, U: 2) (wrapped) |  YYYY YY01             |
+      // |   (101)       |    => Min: 0, Max: 255  |    => Min: 1, Max: 253 |
+      //
+      // On the other hand, ConstantRange [L: 1, U: 3) degrades to
+      // KnownBits 0000 00XX, losing Min: 1, Max: 2 to Min: 0, Max: 3.
+      // So use both if available:
+      if (OpV) {
+        const DataLayout &DL = getDataLayout();
+        KnownBits Known = computeKnownBits(OpV, DL, 0, &AC, nullptr, &DT);
+        MinValue = Known.One.ugt(MinValue) ? Known.One : MinValue;
+      }
+      APInt C = SC->getAPInt();
+      APInt D = MinValue.ugt(C) ? C : MinValue;
+      if (D != 0) {
+        const SCEV *SZExtD = getZeroExtendExpr(getConstant(D), Ty, Depth);
+        const SCEV *SResidual =
+            getAddExpr(getConstant(-D), SA, SCEV::FlagAnyWrap, Depth);
+        const SCEV *SZExtR = getZeroExtendExpr(SResidual, Ty, Depth + 1);
+        return getAddExpr(SZExtD, SZExtR, SCEV::FlagNUW, Depth + 1);
+      }
+    }
   }
 
   if (auto *SM = dyn_cast<SCEVMulExpr>(Op)) {
@@ -6337,8 +6383,10 @@
 
   case Instruction::Trunc:
     return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType());
 
-  case Instruction::ZExt:
-    return getZeroExtendExpr(getSCEV(U->getOperand(0)), U->getType());
+  case Instruction::ZExt: {
+    Value *OpV = U->getOperand(0);
+    return getZeroExtendExprForValue(OpV, getSCEV(OpV), U->getType());
+  }
 
   case Instruction::SExt:
     if (auto BO = MatchBinaryOp(U->getOperand(0), DT)) {
Index: test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll
===================================================================
--- test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll
+++ test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll
@@ -120,3 +120,84 @@
 
   ret void
 }
+
+@z_addr = external global [16 x i8], align 4
+@z_addr_noalign = external global [16 x i8]
+
+%union = type { [10 x [4 x float]] }
+@tmp_addr = external unnamed_addr global { %union, [2000 x i8] }
+
+define void @f3(i8* %x_addr, i8* %y_addr, i32* %tmp_addr) {
+; CHECK-LABEL: Classifying expressions for: @f3
+ entry:
+  %x = load i8, i8* %x_addr
+  %t0 = mul i8 %x, 4
+  %t1 = add i8 %t0, 5
+  %t1.zext = zext i8 %t1 to i16
+; CHECK: %t1.zext = zext i8 %t1 to i16
+; CHECK-NEXT: --> (1 + (zext i8 (4 + (4 * %x)) to i16))<nuw><nsw> U: [1,254) S: [1,257)
+
+  %q0 = mul i8 %x, 4
+  %q1 = add i8 %q0, 7
+  %q1.zext = zext i8 %q1 to i16
+; CHECK: %q1.zext = zext i8 %q1 to i16
+; CHECK-NEXT: --> (3 + (zext i8 (4 + (4 * %x)) to i16))<nuw><nsw> U: [3,256) S: [3,259)
+
+  %p0 = mul i8 %x, 4
+  %p1 = add i8 %p0, 8
+  %p1.zext = zext i8 %p1 to i16
+; CHECK: %p1.zext = zext i8 %p1 to i16
+; CHECK-NEXT: --> (zext i8 (8 + (4 * %x)) to i16) U: [0,253) S: [0,256)
+
+  %r0 = mul i8 %x, 4
+  %r1 = add i8 %r0, 254
+  %r1.zext = zext i8 %r1 to i16
+; CHECK: %r1.zext = zext i8 %r1 to i16
+; CHECK-NEXT: --> (2 + (zext i8 (-4 + (4 * %x)) to i16))<nuw><nsw> U: [2,255) S: [2,258)
+
+  %y = load i8, i8* %y_addr
+  %s0 = mul i8 %x, 32
+  %s1 = mul i8 %y, 36
+  %s2 = add i8 %s0, %s1
+  %s3 = add i8 %s2, 5
+  %s3.zext = zext i8 %s3 to i16
+; CHECK: %s3.zext = zext i8 %s3 to i16
+; CHECK-NEXT: --> (1 + (zext i8 (4 + (32 * %x) + (36 * %y)) to i16))<nuw><nsw> U: [1,254) S: [1,257)
+
+  %ptr = bitcast [16 x i8]* @z_addr to i8*
+  %int0 = ptrtoint i8* %ptr to i32
+  %int5 = add i32 %int0, 5
+  %int.zext = zext i32 %int5 to i64
+; CHECK: %int.zext = zext i32 %int5 to i64
+; CHECK-NEXT: --> (1 + (zext i32 (4 + %int0) to i64))<nuw><nsw> U: [1,4294967294) S: [1,4294967297)
+
+  %ptr_noalign = bitcast [16 x i8]* @z_addr_noalign to i8*
+  %int0_na = ptrtoint i8* %ptr_noalign to i32
+  %int5_na = add i32 %int0_na, 5
+  %int.zext_na = zext i32 %int5_na to i64
+; CHECK: %int.zext_na = zext i32 %int5_na to i64
+; CHECK-NEXT: --> (zext i32 (5 + %int0_na) to i64) U: [0,4294967296) S: [0,4294967296)
+
+  %tmp = load i32, i32* %tmp_addr
+  %mul = and i32 %tmp, -4
+  %add4 = add i32 %mul, 4
+  %add4.zext = zext i32 %add4 to i64
+  %sunkaddr3 = mul i64 %add4.zext, 4
+  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @tmp_addr to i8*), i64 %sunkaddr3
+  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
+  %addr4.cast = bitcast i8* %sunkaddr5 to i32*
+  %addr4.incr = getelementptr i32, i32* %addr4.cast, i64 1
+; CHECK: %addr4.incr = getelementptr i32, i32* %addr4.cast, i64 1
+; CHECK-NEXT: --> ([[C:4100]] + ([[SIZE:4]] * (zext i32 ([[OFFSET:4]] + ([[STRIDE:4]] * (%tmp /u [[STRIDE]]))) to i64)) + @tmp_addr)
+
+  %add5 = add i32 %mul, 5
+  %add5.zext = zext i32 %add5 to i64
+  %sunkaddr0 = mul i64 %add5.zext, 4
+  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @tmp_addr to i8*), i64 %sunkaddr0
+  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
+  %addr5.cast = bitcast i8* %sunkaddr2 to i32*
+; CHECK: %addr5.cast = bitcast i8* %sunkaddr2 to i32*
+; CHECK-NEXT: --> ([[C]] + ([[SIZE]] * (zext i32 ([[OFFSET]] + ([[STRIDE]] * (%tmp /u [[STRIDE]]))) to i64)) + @tmp_addr)
+
+  ret void
+}
Index: test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
@@ -0,0 +1,78 @@
+; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -load-store-vectorizer %s -S -o - | FileCheck %s
+
+target triple = "x86_64--"
+
+%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } }
+
+@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4
+
+; Function Attrs: convergent nounwind
+define void @test(i32 %base) #0 {
+; CHECK-LABEL: @test(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4
+  %tmp296.4 = bitcast float* %arrayidx352.4 to i32*
+  %add350.5 = add i32 5, %mul331
+  %idx351.5 = zext i32 %add350.5 to i64
+  %arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5
+  %tmp296.5 = bitcast float* %arrayidx352.5 to i32*
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:
+  ; If and only if the loads are in a different BB from the GEPs, codegenprepare
+  ; will try to turn the GEPs into math, which makes LoadStoreVectorizer's job
+  ; harder
+  %tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0
+  %tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+define void @test.codegenprepared(i32 %base) #0 {
+; CHECK-LABEL: @test.codegenprepared(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %add350.5 = add i32 5, %mul331
+  %idx351.5 = zext i32 %add350.5 to i64
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:                                            ; preds = %entry
+  %sunkaddr = mul i64 %idx351.4, 4
+  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr
+  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
+  %0 = bitcast i8* %sunkaddr2 to i32*
+  %tmp297.4 = load i32, i32* %0, align 4, !tbaa !0
+  %sunkaddr3 = mul i64 %idx351.5, 4
+  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3
+  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
+  %1 = bitcast i8* %sunkaddr5 to i32*
+  %tmp297.5 = load i32, i32* %1, align 4, !tbaa !0
+  br label %exit
+
+exit:                                             ; preds = %loads, %entry
+  ret void
+}
+
+attributes #0 = { convergent nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
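
Note: for intuition about why peeling D off the constant is safe, the following standalone C++ sketch (not part of the patch; names are illustrative) exhaustively checks the i8 case from the ScalarEvolution.cpp comment: KnownBits proves the low two bits of (4 * X + 5) are 01, so the expression is at least 1 and splitting off D = 1 cannot underflow, which lets zext distribute over the add.

// Standalone sanity check, not part of the patch: verifies the rewrite
//   zext i8 (4*X + 5) to i16  ==  1 + (zext i8 (4*X + 4) to i16)
// for every i8 value of X. The low two bits of (4*X + 5) are always 01,
// so the value is >= 1 and subtracting D = 1 never unsigned-wraps.
#include <cstdint>
#include <cstdio>

int main() {
  for (unsigned X = 0; X < 256; ++X) {
    uint8_t Narrow = static_cast<uint8_t>(4 * X + 5); // the i8 expression
    uint16_t ZExt = Narrow;                           // zext ... to i16
    uint16_t Peeled =
        1 + static_cast<uint16_t>(static_cast<uint8_t>(4 * X + 4));
    if (ZExt != Peeled) {
      std::printf("mismatch at X = %u\n", X);
      return 1;
    }
  }
  std::printf("zext(4*X + 5) == 1 + zext(4*X + 4) for all i8 X\n");
  return 0;
}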
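The ConstantRange half of the table in the comment can be reproduced with LLVM's own APInt/ConstantRange classes; this minimal sketch (again, an illustration compiled separately from the patch, not code the patch adds) shows the wrapped set whose unsigned minimum collapses to 0, which is why getZeroExtendExprForValue also consults KnownBits when the IR Value is available.

// Illustration of the table above: the ConstantRange of i8 (4 * X) is
// [0, 253); adding the constant 5 yields the wrapped set [5, 2), whose
// unsigned min degrades to 0, so ConstantRange alone cannot justify
// peeling D = 1 off the constant.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include <cassert>

using namespace llvm;

int main() {
  ConstantRange Mul(APInt(8, 0), APInt(8, 253));            // i8 4 * X
  ConstantRange Add = Mul.add(ConstantRange(APInt(8, 5)));  // i8 4 * X + 5
  assert(Add.isWrappedSet());        // [5, 2), wraps past 255
  assert(Add.getUnsignedMin() == 0); // => Min: 0, Max: 255
  assert(Add.getUnsignedMax() == 255);
  return 0;
}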