diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
@@ -62,6 +63,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
@@ -85,8 +87,11 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <cassert>
@@ -578,6 +583,44 @@
   return Index;
 }
 
+static bool extendMemBounds(
+    Instruction &I, bool Insert, ScalarEvolution &SE,
+    DenseMap<Value *, std::pair<const SCEV *, const SCEV *>> &MemBounds) {
+
+  BasicBlock *BB = I.getParent();
+  auto GetPtr = [](Instruction *I) -> Value * {
+    if (auto *L = dyn_cast<LoadInst>(I))
+      return L->getPointerOperand();
+    if (auto *S = dyn_cast<StoreInst>(I))
+      return S->getPointerOperand();
+    return nullptr;
+  };
+  auto *Ptr = GetPtr(&I);
+  if (!Ptr)
+    return false;
+  auto *PtrSCEV = SE.getSCEV(Ptr);
+
+  Value *Obj = getUnderlyingObject(Ptr);
+  if (!Obj)
+    return false;
+
+  if (!SE.properlyDominates(PtrSCEV, BB))
+    return false;
+
+  if (Insert)
+    MemBounds.insert({Obj, {PtrSCEV, PtrSCEV}});
+  auto BoundsIter = MemBounds.find(Obj);
+  if (BoundsIter == MemBounds.end())
+    return false;
+
+  if (SE.isKnownPredicate(CmpInst::ICMP_ULT, PtrSCEV, BoundsIter->second.first))
+    BoundsIter->second.first = PtrSCEV;
+  if (SE.isKnownPredicate(CmpInst::ICMP_UGT, PtrSCEV,
+                          BoundsIter->second.second))
+    BoundsIter->second.second = PtrSCEV;
+
+  return true;
+}
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -586,6 +629,16 @@
   struct ScheduleData;
 
 public:
+  // Map of objects to start & end pointers we need to generate runtime checks
+  // for.
+  DenseMap<Value *, std::pair<const SCEV *, const SCEV *>> MemBounds;
+  /// Cache for alias results.
+  /// TODO: consider moving this to the AliasAnalysis itself.
+  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
+  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
+
+  bool CollectMemAccess = false;
+
   using ValueList = SmallVector<Value *, 8>;
   using InstrList = SmallVector<Instruction *, 4>;
   using ValueSet = SmallPtrSet<Value *, 16>;
@@ -664,6 +717,7 @@
     }
     MinBWs.clear();
     InstrElementSize.clear();
+    MemBounds.clear();
   }
 
   unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -1962,11 +2016,6 @@
     return aliased;
   }
 
-  using AliasCacheKey = std::pair<Instruction *, Instruction *>;
-
-  /// Cache for alias results.
-  /// TODO: consider moving this to the AliasAnalysis itself.
-  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
 
   /// Removes an instruction from its block and eventually deletes it.
   /// It's like Instruction::eraseFromParent() except that the actual deletion
@@ -2567,11 +2616,9 @@
            "trying to erase instruction with users.");
     Pair.getFirst()->eraseFromParent();
   }
-#ifdef EXPENSIVE_CHECKS
   // If we could guarantee that this call is not extremely slow, we could
   // remove the ifdef limitation (see PR47712).
   assert(!verifyFunction(*F, &dbgs()));
-#endif
 }
 
 void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
@@ -6097,6 +6144,7 @@
       while (DepDest) {
         assert(isInSchedulingRegion(DepDest));
+        ScheduleData *DestBundle = DepDest->FirstInBundle;
 
         // We have two limits to reduce the complexity:
         // 1) AliasedCheckLimit: It's a small limit to reduce calls to
         //    SLP->isAliased (which is the expensive part in this loop).
@@ -6114,9 +6162,33 @@
          //    balance between reduced runtime and accurate dependencies.
          numAliased++;
 
+          // If this bundle is not scheduled and no versioned code has been
+          // generated yet, try to collect the bounds of the accesses to
+          // generate runtime checks.
+          if (!DestBundle->IsScheduled && SLP->CollectMemAccess) {
+            // FIXME Naming
+            auto GetPtr = [](Instruction *I) -> Value * {
+              if (auto *L = dyn_cast<LoadInst>(I))
+                return L->getPointerOperand();
+              if (auto *S = dyn_cast<StoreInst>(I))
+                return S->getPointerOperand();
+              return nullptr;
+            };
+            auto *Src = GetPtr(SrcInst);
+            auto *Dst = GetPtr(DepDest->Inst);
+
+            if (SrcInst->getParent() == DepDest->Inst->getParent() && Src &&
+                Dst) {
+              bool AddedSrc =
+                  extendMemBounds(*SrcInst, true, *SLP->SE, SLP->MemBounds);
+              bool AddedDst = extendMemBounds(*DepDest->Inst, true,
+                                              *SLP->SE, SLP->MemBounds);
+              if (!AddedSrc || !AddedDst)
+                SLP->MemBounds.clear();
+            }
+          }
          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
-          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
@@ -6596,7 +6668,7 @@
     return PreservedAnalyses::all();
 
   PreservedAnalyses PA;
-  PA.preserveSet<CFGAnalyses>();
+  // PA.preserveSet<CFGAnalyses>();
   return PA;
 }
 
@@ -6643,6 +6715,9 @@
   // Update DFS numbers now so that we can use them for ordering.
   DT->updateDFSNumbers();
 
+  SmallVector<BasicBlock *, 4> BlocksToRetry;
+  SmallVector<DenseMap<Value *, std::pair<const SCEV *, const SCEV *>>, 4>
+      BoundsToUse;
   // Scan the blocks in the function in post order.
   for (auto BB : post_order(&F.getEntryBlock())) {
     collectSeedInstructions(BB);
@@ -6651,7 +6726,40 @@
     if (!Stores.empty()) {
       LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                         << " underlying objects.\n");
-      Changed |= vectorizeStoreChains(R);
+      R.MemBounds.clear();
+
+      auto NoOrSingleSucc = [](BasicBlock *BB) {
+        return succ_begin(BB) == succ_end(BB) ||
+               std::next(succ_begin(BB)) == succ_end(BB);
+      };
+      auto NoOrSinglePred = [](BasicBlock *BB) {
+        return pred_begin(BB) == pred_end(BB) ||
+               std::next(pred_begin(BB)) == pred_end(BB);
+      };
+
+      auto AllUsesInside = [](BasicBlock *BB) {
+        return all_of(*BB, [BB](Instruction &I) {
+          return all_of(I.users(), [BB](User *U) {
+            return cast<Instruction>(U)->getParent() == BB;
+          });
+        });
+      };
+      auto TermSupported = [](BasicBlock *BB) {
+        auto *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
+        return isa<BranchInst>(BB->getTerminator()) ||
+               (RetI && !RetI->getReturnValue());
+      };
+      R.CollectMemAccess = NoOrSingleSucc(BB) && NoOrSinglePred(BB) &&
+                           AllUsesInside(BB) && TermSupported(BB);
+
+      bool VectorizedChains = vectorizeStoreChains(R);
+      if (!VectorizedChains && !R.MemBounds.empty()) {
+        BlocksToRetry.push_back(BB);
+        BoundsToUse.push_back(R.MemBounds);
+      }
+      R.CollectMemAccess = false;
+      R.MemBounds.clear();
+      Changed |= VectorizedChains;
     }
 
     // Vectorize trees that end at reductions.
@@ -6667,6 +6775,161 @@
     }
   }
 
+  R.AliasCache.clear();
+  for (unsigned I = 0; I != BlocksToRetry.size(); I++) {
+    BasicBlock *BB = BlocksToRetry[I];
+    auto &MemBounds = BoundsToUse[I];
+
+    for (Instruction &I : *BB)
+      extendMemBounds(I, false, *SE, MemBounds);
+
+    LLVMContext &Ctx = BB->getContext();
+    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+    std::string OriginalName = BB->getName().str();
+    auto *CheckBlock = splitBlockBefore(BB, &*BB->getFirstNonPHI(), &DTU, LI,
+                                        nullptr, OriginalName + ".slpmemcheck");
+    auto *MergeBlock = BB;
+    BB = splitBlockBefore(BB, BB->getTerminator(), &DTU, LI, nullptr,
+                          OriginalName + ".slpversioned");
+
+    ValueToValueMapTy VMap;
+    auto *Scalar = CloneBasicBlock(BB, VMap, "", BB->getParent());
+    Scalar->setName(OriginalName + ".scalar");
+    MergeBlock->setName(OriginalName + ".merge");
+    SmallVector<BasicBlock *, 4> Tmp;
+    Tmp.push_back(Scalar);
+    remapInstructionsInBlocks(Tmp, VMap);
+
+    Value *MemoryRuntimeCheck = nullptr;
+    Instruction *FirstInst = nullptr;
+    SCEVExpander Exp(*SE, BB->getParent()->getParent()->getDataLayout(),
+                     "memcheck");
+    SmallVector<std::pair<Value *, Value *>, 4> ExpandedBounds;
+    Type *PtrArithTy =
+        Type::getInt8PtrTy(BB->getParent()->getParent()->getContext(), 0);
+    for (auto &KV : MemBounds) {
+      ExpandedBounds.emplace_back(
+          Exp.expandCodeFor(KV.second.first, PtrArithTy,
+                            CheckBlock->getTerminator()),
+          Exp.expandCodeFor(KV.second.second, PtrArithTy,
+                            CheckBlock->getTerminator()));
+    }
+    auto GetFirstInst = [](Instruction *FirstInst, Value *V,
+                           Instruction *Loc) -> Instruction * {
+      if (FirstInst)
+        return FirstInst;
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return I->getParent() == Loc->getParent() ? I : nullptr;
+      return nullptr;
+    };
+
+    Instruction *Loc = CheckBlock->getTerminator();
+    IRBuilder<> ChkBuilder(CheckBlock->getTerminator());
+    for (unsigned i = 0; i < MemBounds.size(); ++i) {
+      for (unsigned j = i + 1; j < MemBounds.size(); ++j) {
+        Value *ALow = ExpandedBounds[i].first;
+        Value *AHigh = ExpandedBounds[i].second;
+        Value *BLow = ExpandedBounds[j].first;
+        Value *BHigh = ExpandedBounds[j].second;
+
+        unsigned AS0 = ALow->getType()->getPointerAddressSpace();
+        unsigned AS1 = BLow->getType()->getPointerAddressSpace();
+
+        Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+        Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+        Value *Start0 = ChkBuilder.CreateBitCast(ALow, PtrArithTy0, "bc");
+        Value *Start1 = ChkBuilder.CreateBitCast(BLow, PtrArithTy1, "bc");
+        Value *End0 = ChkBuilder.CreateBitCast(AHigh, PtrArithTy1, "bc");
+        Value *End1 = ChkBuilder.CreateBitCast(BHigh, PtrArithTy0, "bc");
+        // [A|B].Start points to the first accessed byte under base [A|B].
+        // [A|B].End points to the last accessed byte, plus one.
+        // There is no conflict when the intervals are disjoint:
+        // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
+        //
+        // bound0 = (B.Start < A.End)
+        // bound1 = (A.Start < B.End)
+        // IsConflict = bound0 & bound1
+        Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
+        FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
+        Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
+        FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
+        Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+        FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+        if (MemoryRuntimeCheck) {
+          IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+                                           "conflict.rdx");
+          FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+        }
+        MemoryRuntimeCheck = IsConflict;
+      }
+    }
+
+    ChkBuilder.CreateCondBr(MemoryRuntimeCheck, BB, Scalar);
+    CheckBlock->getTerminator()->eraseFromParent();
+    DTU.applyUpdates({{DT->Insert, CheckBlock, Scalar}});
+    Changed = true;
+
+    MDBuilder MDB(Ctx);
+    MDNode *Domain = MDB.createAnonymousAliasScopeDomain("SLPVerDomain");
+
+    DenseMap<const std::pair<const SCEV *, const SCEV *> *, MDNode *>
+        GroupToScope;
+    for (const auto &Group : MemBounds)
+      GroupToScope[&Group.second] = MDB.createAnonymousAliasScope(Domain);
+
+    for (Instruction &I : *BB) {
+      auto GetPtr = [](Instruction *I) -> Value * {
+        if (auto *L = dyn_cast<LoadInst>(I))
+          return L->getPointerOperand();
+        if (auto *S = dyn_cast<StoreInst>(I))
+          return S->getPointerOperand();
+        return nullptr;
+      };
+      auto *Ptr = GetPtr(&I);
+      if (!Ptr)
+        continue;
+      auto *PtrSCEV = SE->getSCEV(Ptr);
+
+      Value *Obj = getUnderlyingObject(Ptr);
+      if (!Obj)
+        continue;
+
+      auto BoundsIter = MemBounds.find(Obj);
+      if (BoundsIter == MemBounds.end())
+        continue;
+      auto *LowerBound = BoundsIter->second.first;
+      auto *UpperBound = BoundsIter->second.second;
+      auto *Scope = GroupToScope.find(&BoundsIter->second)->second;
+      if (SE->isKnownPredicate(CmpInst::ICMP_UGE, PtrSCEV, LowerBound) &&
+          SE->isKnownPredicate(CmpInst::ICMP_ULE, PtrSCEV, UpperBound)) {
+        I.setMetadata(
+            LLVMContext::MD_alias_scope,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_alias_scope),
+                                MDNode::get(Ctx, Scope)));
+
+        SmallVector<Metadata *, 4> NonAliasing;
+        for (auto &KV : GroupToScope) {
+          if (KV.first == &BoundsIter->second)
+            continue;
+          NonAliasing.push_back(KV.second);
+        }
+        I.setMetadata(
+            LLVMContext::MD_noalias,
+            MDNode::concatenate(I.getMetadata(LLVMContext::MD_noalias),
+                                MDNode::get(Ctx, NonAliasing)));
+      }
+    }
+
+    collectSeedInstructions(BB);
+
+    // Vectorize trees that end at stores.
+    if (!Stores.empty()) {
+      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+                        << " underlying objects.\n");
+      Changed |= vectorizeStoreChains(R);
+    }
+  }
+
   if (Changed) {
     R.optimizeGatherSequence();
     LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -3,16 +3,38 @@
 
 define void @needs_versioning(i32* %dst, i32* %src) {
 ; CHECK-LABEL: @needs_versioning(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SRC_0:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT: [[R_0:%.*]] = ashr i32 [[SRC_0]], 16
-; CHECK-NEXT: store i32 [[R_0]], i32* [[DST:%.*]], align 4
+; CHECK-NEXT: entry.slpmemcheck:
+; CHECK-NEXT: [[SRC8:%.*]] = bitcast i32* [[SRC:%.*]] to i8*
+; CHECK-NEXT: [[DST10:%.*]] = bitcast i32* [[DST:%.*]] to i8*
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[SRC]], i64 1
+; CHECK-NEXT: [[SCEVGEP9:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[DST]], i64 1
+; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast i32* [[SCEVGEP11]] to i8*
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SRC8]], [[SCEVGEP1112]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[DST10]], [[SCEVGEP9]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[ENTRY_SLPVERSIONED:%.*]], label [[ENTRY_SCALAR:%.*]]
+; CHECK: entry.slpversioned:
 ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1
-; CHECK-NEXT: [[SRC_1:%.*]] = load i32, i32* [[SRC_GEP_1]], align 4
-; CHECK-NEXT: [[R_1:%.*]] = ashr i32 [[SRC_1]], 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i32> [[TMP1]], <i32 16, i32 16>
 ; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1
-; CHECK-NEXT: store i32 [[R_1]], i32* [[DST_GEP_1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]]
+; CHECK: entry.merge:
 ; CHECK-NEXT: ret void
+; CHECK: entry.scalar:
+; CHECK-NEXT: [[SRC_02:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[R_03:%.*]] = ashr i32 [[SRC_02]], 16
+; CHECK-NEXT: store i32 [[R_03]], i32* [[DST]], align 4
+; CHECK-NEXT: [[SRC_GEP_14:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 1
+; CHECK-NEXT: [[SRC_15:%.*]] = load i32, i32* [[SRC_GEP_14]], align 4
+; CHECK-NEXT: [[R_16:%.*]] = ashr i32 [[SRC_15]], 16
+; CHECK-NEXT: [[DST_GEP_17:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 1
+; CHECK-NEXT: store i32 [[R_16]], i32* [[DST_GEP_17]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE]]
 ;
 entry:
   %src.0 = load i32, i32* %src, align 4
@@ -52,11 +74,22 @@
 
 define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) {
 ; CHECK-LABEL: @version_multiple(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4
+; CHECK-NEXT: entry.slpmemcheck:
+; CHECK-NEXT: [[COUNTER12:%.*]] = bitcast i32* [[COUNTER:%.*]] to i8*
+; CHECK-NEXT: [[OUT_BLOCK14:%.*]] = bitcast i32* [[OUT_BLOCK:%.*]] to i8*
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[COUNTER]], i64 3
+; CHECK-NEXT: [[SCEVGEP13:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i32, i32* [[OUT_BLOCK]], i64 2
+; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i32* [[SCEVGEP15]] to i8*
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[COUNTER12]], [[SCEVGEP1516]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[OUT_BLOCK14]], [[SCEVGEP13]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[ENTRY_SLPVERSIONED:%.*]], label [[ENTRY_SCALAR:%.*]]
+; CHECK: entry.slpversioned:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER]], align 4, !alias.scope !5, !noalias !8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4, !alias.scope !8, !noalias !5
 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4, !alias.scope !8, !noalias !5
 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1
 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1
@@ -64,18 +97,43 @@
 ; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4
 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX_2]] to <2 x i32>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
-; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2_2]] to <2 x i32>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[TMP7]], [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_2]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP8]], <2 x i32>* [[TMP9]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE:%.*]]
+; CHECK: entry.merge:
 ; CHECK-NEXT: ret void
+; CHECK: entry.scalar:
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[COUNTER]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: [[XOR2:%.*]] = xor i32 [[TMP11]], [[TMP10]]
+; CHECK-NEXT: store i32 [[XOR2]], i32* [[OUT_BLOCK]], align 4
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX_13]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_14:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2_14]], align 4
+; CHECK-NEXT: [[XOR_15:%.*]] = xor i32 [[TMP13]], [[TMP12]]
+; CHECK-NEXT: store i32 [[XOR_15]], i32* [[ARRAYIDX2_14]], align 4
+; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX_26]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_27:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX2_27]], align 4
+; CHECK-NEXT: [[XOR_28:%.*]] = xor i32 [[TMP15]], [[TMP14]]
+; CHECK-NEXT: store i32 [[XOR_28]], i32* [[ARRAYIDX2_27]], align 4
+; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX_39]], align 4
+; CHECK-NEXT: [[ARRAYIDX2_310:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX2_310]], align 4
+; CHECK-NEXT: [[XOR_311:%.*]] = xor i32 [[TMP17]], [[TMP16]]
+; CHECK-NEXT: store i32 [[XOR_311]], i32* [[ARRAYIDX2_310]], align 4
+; CHECK-NEXT: br label [[ENTRY_MERGE]]
 ;
 entry:
   %0 = load i32, i32* %counter, align 4
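
Note (not part of the patch): the check block built above compares each pair of expanded bounds with the same interval-overlap test used by the loop vectorizer's runtime checks and OR-reduces the per-pair results into one conflict flag. A minimal standalone C++ sketch of that logic, with illustrative names only:

#include <cstdint>
#include <iostream>
#include <vector>

// One accessed range: Start is the first accessed byte, End is one past the
// last accessed byte (matching the "last accessed byte, plus one" comment).
struct MemRange {
  uintptr_t Start;
  uintptr_t End;
};

// Mirrors the per-pair IR: bound0 = (B.Start < A.End),
// bound1 = (A.Start < B.End), found.conflict = bound0 & bound1.
static bool rangesConflict(const MemRange &A, const MemRange &B) {
  return B.Start < A.End && A.Start < B.End;
}

// OR-reduction over all pairs, like the conflict.rdx chain in the check block.
static bool anyConflict(const std::vector<MemRange> &Ranges) {
  bool Conflict = false;
  for (size_t I = 0; I < Ranges.size(); ++I)
    for (size_t J = I + 1; J < Ranges.size(); ++J)
      Conflict |= rangesConflict(Ranges[I], Ranges[J]);
  return Conflict;
}

int main() {
  std::vector<MemRange> Disjoint = {{0, 8}, {16, 24}};
  std::vector<MemRange> Overlapping = {{0, 8}, {4, 12}};
  std::cout << anyConflict(Disjoint) << "\n";    // prints 0: no conflict
  std::cout << anyConflict(Overlapping) << "\n"; // prints 1: conflict
  return 0;
}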