Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -15,7 +15,9 @@
 //
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
 
 using namespace llvm;
@@ -913,6 +915,49 @@
   return true;
 }
 
+/// Return true if we can prove that the given load is safe to evaluate
+/// anywhere within the loop body (i.e. does not require predication beyond
+/// that required by the header itself).
+static bool isSafeToLoadUnconditionallyInLoop(LoadInst *LI, Loop *L,
+                                              const DataLayout &DL,
+                                              ScalarEvolution &SE,
+                                              DominatorTree &DT) {
+  Value *Ptr = LI->getPointerOperand();
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
+    return false;
+  auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
+  if (!Step)
+    return false;
+  APInt StepC = Step->getAPInt();
+  APInt AccessSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+                   DL.getTypeStoreSize(LI->getType()));
+  // TODO: generalize to access patterns which don't touch every byte
+  if (StepC != AccessSize)
+    return false;
+
+  auto TC = SE.getSmallConstantTripCount(L);
+  if (!TC)
+    return false;
+
+  auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart());
+  if (!StartS || !SE.isLoopInvariant(StartS, L))
+    return false;
+
+  Value *Base = StartS->getValue();
+
+  // We have to adjust the size down by one iteration as the code in Loads.cpp
+  // adds one access back.  TODO: cleanup
+  APInt Size(DL.getIndexTypeSizeInBits(Ptr->getType()),
+             (TC - 1) * DL.getTypeStoreSize(LI->getType()));
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
+  return isSafeToLoadUnconditionally(Base, LI->getAlignment(), Size, DL,
+                                     HeaderFirstNonPHI, &DT);
+}
+
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion) {
     reportVectorizationFailure("If-conversion is disabled",
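(Worked example of the size computation above, with illustrative numbers not taken from the patch: for i32 loads, AccessSize is 4 bytes, so a unit-stride AddRec must have StepC = 4. With a constant trip count TC = 1024, Size = (1024 - 1) * 4 = 4092 bytes; since, per the TODO above, the code in Loads.cpp adds one access back, isSafeToLoadUnconditionally effectively asks whether Base is dereferenceable for 4092 + 4 = 4096 bytes, exactly the range the loop can touch.)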
@@ -924,17 +969,55 @@
   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
 
-  // A list of pointers that we can safely read and write to.
+  // A list of pointers which are known to be dereferenceable within the scope
+  // of the loop body for each iteration of the loop. That is, the memory
+  // pointed to can be dereferenced (with the access size implied by the
+  // value's type) unconditionally within the loop header without introducing
+  // a new fault.
   SmallPtrSet<Value *, 8> SafePointes;
 
   // Collect safe addresses.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (blockNeedsPredication(BB))
+    if (blockNeedsPredication(BB)) {
+      // For a block which requires predication, an address may be safe to
+      // access in the loop without predication if we can prove
+      // dereferenceability facts sufficient to ensure it'll never fault
+      // within the loop.
+      // TODO: extend the load reasoning to stores, etc. (requires some API
+      // cleanup and renaming in Loads.h)
+      for (Instruction &I : *BB)
+        if (auto *Ptr = getLoadStorePointerOperand(&I)) {
+          // The choice of header vs preheader is deliberate here to allow
+          // loop-varying addresses which can still be proven.
+          Instruction *HeaderFirstNonPHI =
+              TheLoop->getHeader()->getFirstNonPHI();
+          if (isSafeToSpeculativelyExecute(&I, HeaderFirstNonPHI, DT)) {
+            SafePointes.insert(Ptr);
+            continue;
+          }
+          // If the generic version didn't work, try to use SCEV to prove
+          // bounds on the access and discharge them against the allocation.
+          LoadInst *LI = dyn_cast<LoadInst>(&I);
+          if (!LI)
+            continue;
+          if (!LI->isUnordered() ||
+              // Speculative load may create a race that did not exist in
+              // the source.
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) ||
+              // Speculative load may load data from dirty regions.
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
+              LI->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
+            continue;
+          auto &DL = LI->getModule()->getDataLayout();
+          if (isSafeToLoadUnconditionallyInLoop(LI, TheLoop, DL,
+                                                *PSE.getSE(), *DT))
+            SafePointes.insert(Ptr);
+        }
       continue;
-
+    }
+
     for (Instruction &I : *BB)
       if (auto *Ptr = getLoadStorePointerOperand(&I))
         SafePointes.insert(Ptr);
+    continue;
   }
 
   // Collect the blocks that need predication.
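To make the intent concrete before the test deltas below, here is a minimal sketch of the kind of loop the new reasoning targets (illustrative IR written for this summary, not copied from the test file): the load sits in a block that needs predication, but its address is a unit-stride AddRec over a loop with a known trip count of 1024, and its base is an alloca known dereferenceable for all 4096 bytes the loop can touch, so the vectorizer may emit a plain wide load instead of a masked load.

  define i32 @sum_if(i1 %c) {
  entry:
    ; 1024 x i32 = 4096 dereferenceable bytes, covering every iteration.
    %alloca = alloca [1024 x i32]
    %base = bitcast [1024 x i32]* %alloca to i32*
    br label %loop

  loop:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
    %sum = phi i32 [ 0, %entry ], [ %sum.next, %latch ]
    ; %do_load requires predication, but the load below never faults.
    br i1 %c, label %do_load, label %latch

  do_load:
    ; {%base,+,4} over the loop: stride equals the i32 access size.
    %addr = getelementptr inbounds i32, i32* %base, i64 %iv
    %v = load i32, i32* %addr, align 4
    br label %latch

  latch:
    %val = phi i32 [ %v, %do_load ], [ 0, %loop ]
    %sum.next = add i32 %sum, %val
    %iv.next = add nuw nsw i64 %iv, 1
    ; Constant trip count of 1024, so SCEV can bound the accessed range.
    %cmp = icmp ult i64 %iv.next, 1024
    br i1 %cmp, label %loop, label %exit

  exit:
    ret i32 %sum.next
  }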
Index: test/Transforms/LoopVectorize/X86/load-deref-pred.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -67,24 +67,24 @@
 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
 ; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[TMP16]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
 ; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[TMP17]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4
 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
 ; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP29]], i32 4, <4 x i1> [[TMP18]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
 ; CHECK-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP31]], i32 4, <4 x i1> [[TMP19]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4
 ; CHECK-NEXT: [[TMP32:%.*]] = xor <4 x i1> [[TMP16]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[TMP18]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[TMP19]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI16:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[WIDE_MASKED_LOAD13]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[WIDE_MASKED_LOAD14]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI18:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[WIDE_MASKED_LOAD15]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI16:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[WIDE_LOAD13]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI17:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[WIDE_LOAD14]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI18:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[WIDE_LOAD15]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT: [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI16]]
 ; CHECK-NEXT: [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI17]]
@@ -244,24 +244,24 @@
 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4
 ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4
 ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]]
 ; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]]
@@ -960,24 +960,24 @@
 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4
 ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4
 ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]]
 ; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]]
@@ -1137,24 +1137,24 @@
 ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 0
 ; CHECK-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP69]], i32 4, <4 x i1> [[TMP39]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 4
 ; CHECK-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP71]], i32 4, <4 x i1> [[TMP47]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP71]], align 4
 ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 8
 ; CHECK-NEXT: [[TMP73:%.*]] = bitcast i32* [[TMP72]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP73]], i32 4, <4 x i1> [[TMP55]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP73]], align 4
 ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[TMP64]], i32 12
 ; CHECK-NEXT: [[TMP75:%.*]] = bitcast i32* [[TMP74]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP75]], i32 4, <4 x i1> [[TMP63]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP75]], align 4
 ; CHECK-NEXT: [[TMP76:%.*]] = xor <4 x i1> [[TMP39]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP77:%.*]] = xor <4 x i1> [[TMP47]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP78:%.*]] = xor <4 x i1> [[TMP55]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP79:%.*]] = xor <4 x i1> [[TMP63]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP80]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT: [[TMP81]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]]
 ; CHECK-NEXT: [[TMP82]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]]
@@ -1300,24 +1300,24 @@
 ; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[BASE]], i64 [[TMP12]]
 ; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0
 ; CHECK-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP57]], i32 4, <4 x i1> [[TMP27]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP57]], align 4
 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 4
 ; CHECK-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP59]], i32 4, <4 x i1> [[TMP35]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 8
 ; CHECK-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP61]], i32 4, <4 x i1> [[TMP43]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP61]], align 4
 ; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 12
 ; CHECK-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP63]], i32 4, <4 x i1> [[TMP51]], <4 x i32> undef)
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP63]], align 4
 ; CHECK-NEXT: [[TMP64:%.*]] = xor <4 x i1> [[TMP27]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP65:%.*]] = xor <4 x i1> [[TMP35]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP66:%.*]] = xor <4 x i1> [[TMP43]], <i1 true, i1 true, i1 true, i1 true>
 ; CHECK-NEXT: [[TMP67:%.*]] = xor <4 x i1> [[TMP51]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP27]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP35]], <4 x i32> [[WIDE_MASKED_LOAD7]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP43]], <4 x i32> [[WIDE_MASKED_LOAD8]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP51]], <4 x i32> [[WIDE_MASKED_LOAD9]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP27]], <4 x i32> [[WIDE_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI10:%.*]] = select <4 x i1> [[TMP35]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI11:%.*]] = select <4 x i1> [[TMP43]], <4 x i32> [[WIDE_LOAD8]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP51]], <4 x i32> [[WIDE_LOAD9]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP68]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT: [[TMP69]] = add <4 x i32> [[VEC_PHI4]], [[PREDPHI10]]
 ; CHECK-NEXT: [[TMP70]] = add <4 x i32> [[VEC_PHI5]], [[PREDPHI11]]