Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -15,7 +15,9 @@
 //
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicInst.h"
 
 using namespace llvm;
@@ -913,6 +915,49 @@
   return true;
 }
 
+/// Return true if we can prove that the given load is safe to evaluate
+/// anywhere within the loop body (i.e. does not require predication beyond
+/// that required by the header itself).
+static bool isSafeToLoadUnconditionallyInLoop(LoadInst *LI, Loop *L,
+                                              const DataLayout &DL,
+                                              ScalarEvolution &SE,
+                                              DominatorTree &DT) {
+  Value *Ptr = LI->getPointerOperand();
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
+    return false;
+  auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
+  if (!Step)
+    return false;
+  APInt StepC = Step->getAPInt();
+  APInt AccessSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+                   DL.getTypeStoreSize(LI->getType()));
+  // TODO: generalize to access patterns which don't touch every byte
+  if (StepC != AccessSize)
+    return false;
+
+  auto TC = SE.getSmallConstantTripCount(L);
+  if (!TC)
+    return false;
+
+  auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart());
+  if (!StartS || !SE.isLoopInvariant(StartS, L))
+    return false;
+
+  Value *Base = StartS->getValue();
+
+  // We have to adjust the size down by one iteration as Loads.cpp adds one
+  // back.  TODO: cleanup
+  APInt Size(DL.getIndexTypeSizeInBits(Ptr->getType()),
+             (TC - 1) * DL.getTypeStoreSize(LI->getType()));
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
+  return isSafeToLoadUnconditionally(Base,
+                                     LI->getAlignment(),
+                                     Size,
+                                     DL, HeaderFirstNonPHI,
+                                     &DT);
+}
+
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion) {
     reportVectorizationFailure("If-conversion is disabled",
@@ -924,17 +969,55 @@
 
   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
 
-  // A list of pointers that we can safely read and write to.
+  // A list of pointers which are known to be dereferenceable within scope of
+  // the loop body for each iteration of the loop. That is, the memory pointed
+  // to can be dereferenced (with the access size implied by the value's type)
+  // unconditionally within the loop header without introducing a new
+  // fault.
   SmallPtrSet<Value *, 8> SafePointes;
 
   // Collect safe addresses.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (blockNeedsPredication(BB))
+    if (blockNeedsPredication(BB)) {
+      // For a block which requires predication, an address may be safe to
+      // access in the loop w/o predication if we can prove dereferenceability
+      // facts sufficient to ensure it'll never fault within the loop.
+      // TODO: extend the load reasoning to stores, etc. (requires some API
+      // cleanup and renaming in Loads.h)
+      for (Instruction &I : *BB)
+        if (auto *Ptr = getLoadStorePointerOperand(&I)) {
+          // The choice of header vs preheader is deliberate here to allow loop
+          // varying addresses which can still be proven.
+          Instruction *HeaderFirstNonPHI =
+              TheLoop->getHeader()->getFirstNonPHI();
+          if (isSafeToSpeculativelyExecute(&I, HeaderFirstNonPHI, DT)) {
+            SafePointes.insert(Ptr);
+            continue;
+          }
+          // If the generic version didn't work, try to use SCEV to prove
+          // bounds on the access and discharge them against the allocation.
+          LoadInst *LI = dyn_cast<LoadInst>(&I);
+          if (!LI)
+            continue;
+          if (!LI->isUnordered() ||
+              // Speculative load may create a race that did not exist in the source.
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) ||
+              // Speculative load may load data from dirty regions.
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
+            continue;
+          auto &DL = LI->getModule()->getDataLayout();
+          if (isSafeToLoadUnconditionallyInLoop(LI, TheLoop, DL,
+                                                *PSE.getSE(), *DT))
+            SafePointes.insert(Ptr);
+        }
       continue;
-
+    }
+
     for (Instruction &I : *BB)
       if (auto *Ptr = getLoadStorePointerOperand(&I))
         SafePointes.insert(Ptr);
+    continue;
   }
 
   // Collect the blocks that need predication.
Index: test/Transforms/LoopVectorize/load-deref-pred.ll
===================================================================
--- test/Transforms/LoopVectorize/load-deref-pred.ll
+++ test/Transforms/LoopVectorize/load-deref-pred.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -force-vector-width=4 -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @init(i32*)
+
+;; We don't need a masked.load for this due to deref facts, and can instead
+;; use a plain vector load
+define i32 @test_explicit_pred(i64 %len) {
+; CHECK-LABEL: @test_explicit_pred(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32]
+; CHECK-NEXT:    [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32*
+; CHECK-NEXT:    call void @init(i32* [[BASE]])
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[LEN:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> undef, i64 [[LEN]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp slt <4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 4
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <4 x i1> [[TMP8]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <4 x i1> [[TMP9]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[PREDPHI6:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[WIDE_LOAD5]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP18]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP19]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI6]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP19]], [[TMP18]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX7:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF8:%.*]] = shufflevector <4 x i32> [[BIN_RDX7]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX9:%.*]] = add <4 x i32> [[BIN_RDX7]], [[RDX_SHUF8]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i32> [[BIN_RDX9]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 4096, 4096
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EARLYCND:%.*]] = icmp slt i64 [[IV]], [[LEN]]
+; CHECK-NEXT:    br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]]
+; CHECK:       pred:
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[IV]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, i32* [[ADDR]]
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ]
+; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]]
+; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
+; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop !2
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
+;
+entry:
+  %alloca = alloca [4096 x i32]
+  %base = bitcast [4096 x i32]* %alloca to i32*
+  call void @init(i32* %base)
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
+  %iv.next = add i64 %iv, 1
+  %earlycnd = icmp slt i64 %iv, %len
+  br i1 %earlycnd, label %pred, label %latch
+pred:
+  %addr = getelementptr inbounds i32, i32* %base, i64 %iv
+  %val = load i32, i32* %addr
+  br label %latch
+latch:
+  %val.phi = phi i32 [0, %loop], [%val, %pred]
+  %accum.next = add i32 %accum, %val.phi
+  %exit = icmp ugt i64 %iv, 4094
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  ret i32 %accum.next
+}
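
Note (not part of the patch): the codegen difference the test above checks for can be summarized with a small hand-written IR sketch. Without the dereferenceability reasoning added here, the predicated load in the loop body has to be emitted as an llvm.masked.load in the vectorized loop; once the whole [4096 x i32] allocation is known to be safe to access for the full trip count, a plain wide load plus a select on the original predicate is legal, which is what the WIDE_LOAD/PREDPHI lines above expect. The function and value names below are illustrative only, not taken from any actual vectorizer output.

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

; Predicated form: lanes where %mask is false must not be accessed at all,
; so a masked load is required.
define <4 x i32> @predicated_form(<4 x i32>* %vec.ptr, <4 x i1> %mask) {
  %v = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %vec.ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %r = select <4 x i1> %mask, <4 x i32> %v, <4 x i32> zeroinitializer
  ret <4 x i32> %r
}

; Unconditional form: legal once every accessed byte is known dereferenceable;
; the predicate only selects between the loaded value and zero.
define <4 x i32> @unconditional_form(<4 x i32>* %vec.ptr, <4 x i1> %mask) {
  %v = load <4 x i32>, <4 x i32>* %vec.ptr, align 4
  %r = select <4 x i1> %mask, <4 x i32> %v, <4 x i32> zeroinitializer
  ret <4 x i32> %r
}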