Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -111,8 +111,8 @@ /// Checks if there are any instructions which may affect the memory accessed /// in the chain between \p From and \p To. The elements of \p Chain should be /// all loads or all stores. - bool isVectorizable(ArrayRef Chain, BasicBlock::iterator From, - BasicBlock::iterator To); + unsigned isVectorizable(ArrayRef Chain, BasicBlock::iterator From, + BasicBlock::iterator To); /// Collects load and store instructions to vectorize. void collectInstructions(BasicBlock *BB); @@ -125,10 +125,12 @@ bool vectorizeInstructions(ArrayRef Instrs); /// Vectorizes the load instructions in Chain. - bool vectorizeLoadChain(ArrayRef Chain); + bool vectorizeLoadChain(ArrayRef Chain, + SmallVector *VectorizedSet); /// Vectorizes the store instructions in Chain. - bool vectorizeStoreChain(ArrayRef Chain); + bool vectorizeStoreChain(ArrayRef Chain, + SmallVector *VectorizedSet); /// Query target for allowed misaligned accesses bool allowsMisalignedAndIsFast(unsigned SzInBytes, unsigned AddressSpace, @@ -433,9 +435,9 @@ return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); } -bool Vectorizer::isVectorizable(ArrayRef Chain, - BasicBlock::iterator From, - BasicBlock::iterator To) { +unsigned Vectorizer::isVectorizable(ArrayRef Chain, + BasicBlock::iterator From, + BasicBlock::iterator To) { SmallVector, 16> MemoryInstrs; SmallVector, 16> ChainInstrs; @@ -448,19 +450,20 @@ ChainInstrs.push_back({&*I, Idx}); } else if (I->mayHaveSideEffects()) { DEBUG(dbgs() << "LSV: Found side-effecting operation: " << *I << '\n'); - return false; + return 0; } } assert(Chain.size() == ChainInstrs.size() && "All instructions in the Chain must exist in [From, To)."); - for (auto EntryMem : MemoryInstrs) { - Value *V = EntryMem.first; - unsigned VIdx = EntryMem.second; - for (auto EntryChain : ChainInstrs) { - Value *VV = EntryChain.first; - unsigned VVIdx = EntryChain.second; + Idx = 0; + for (auto EntryChain : ChainInstrs) { + Value *VV = EntryChain.first; + unsigned VVIdx = EntryChain.second; + for (auto EntryMem : MemoryInstrs) { + Value *V = EntryMem.first; + unsigned VIdx = EntryMem.second; if (isa(V) && isa(VV)) continue; @@ -490,12 +493,13 @@ << *VV << " aliases " << *Ptr1 << '\n'; }); - return false; + return Idx; } } + Idx++; } - return true; + return Chain.size(); } void Vectorizer::collectInstructions(BasicBlock *BB) { @@ -628,8 +632,16 @@ bool Changed = false; SmallPtrSet VectorizedValues; - for (int Head : Heads) { - if (Tails.count(Head)) + for (unsigned i = 0; i < Heads.size(); i++) { + int Head = Heads[i]; + if (VectorizedValues.count(Instrs[Head])) + continue; + for (unsigned j = 0; j < Tails.size(); j++) + if (Head == Tails[j] && !VectorizedValues.count(Instrs[Heads[j]])) { + Head = -1; + break; + } + if (Head < 0) continue; // We found an instr that starts a chain. Now follow the chain and try to @@ -645,26 +657,28 @@ } bool Vectorized = false; + SmallVector VectorizedSet; if (isa(*Operands.begin())) - Vectorized = vectorizeLoadChain(Operands); + Vectorized = vectorizeLoadChain(Operands, &VectorizedSet); else - Vectorized = vectorizeStoreChain(Operands); + Vectorized = vectorizeStoreChain(Operands, &VectorizedSet); // Mark the vectorized instructions so that we don't vectorize them again. - if (Vectorized) - VectorizedValues.insert(Operands.begin(), Operands.end()); + if (VectorizedSet.size()) + VectorizedValues.insert(VectorizedSet.begin(), VectorizedSet.end()); Changed |= Vectorized; } return Changed; } -bool Vectorizer::vectorizeStoreChain(ArrayRef Chain) { - StoreInst *S0 = cast(Chain[0]); +bool Vectorizer::vectorizeStoreChain(ArrayRef FullChain, + SmallVector *VectorizedSet) { + StoreInst *S0 = cast(FullChain[0]); // If the vector has an int element, default to int for the whole load. Type *StoreTy; - for (const auto &V : Chain) { + for (const auto &V : FullChain) { StoreTy = cast(V)->getValueOperand()->getType(); if (StoreTy->isIntOrIntVectorTy()) break; @@ -680,10 +694,31 @@ unsigned AS = S0->getPointerAddressSpace(); unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); + unsigned ChainSize = FullChain.size(); - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) + if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { + VectorizedSet->append(FullChain.begin(), FullChain.end()); return false; + } + + // Get enclosing instructions in BB + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(FullChain); + // Get the largest vectorizable prefix of the chain + unsigned StopChain = isVectorizable(FullChain, First, Last); + if (StopChain < 2) { + // There exists a side effect instruction, no vectorization possible + if (!StopChain) + VectorizedSet->append(FullChain.begin(), FullChain.end()); + // Failed after the first instruction, discard it and retry the smaller + // chain + else + VectorizedSet->append(FullChain.begin(), FullChain.begin() + 1); + return false; + } + + ArrayRef Chain = FullChain.slice(0, StopChain); + ChainSize = Chain.size(); // Store size should be 1B, 2B or multiple of 4B. // TODO: Target hook for size constraint? @@ -692,11 +727,11 @@ DEBUG(dbgs() << "LSV: Size should be 1B, 2B " "or multiple of 4B. Splitting.\n"); if (SzInBytes == 3) - return vectorizeStoreChain(Chain.slice(0, ChainSize - 1)); + return vectorizeStoreChain(Chain.slice(0, ChainSize - 1), VectorizedSet); auto Chains = splitOddVectorElts(Chain, Sz); - return vectorizeStoreChain(Chains.first) | - vectorizeStoreChain(Chains.second); + return vectorizeStoreChain(Chains.first, VectorizedSet) | + vectorizeStoreChain(Chains.second, VectorizedSet); } VectorType *VecTy; @@ -712,8 +747,8 @@ if (ChainSize > VF) { DEBUG(dbgs() << "LSV: Vector factor is too big." " Creating two separate arrays.\n"); - return vectorizeStoreChain(Chain.slice(0, VF)) | - vectorizeStoreChain(Chain.slice(VF)); + return vectorizeStoreChain(Chain.slice(0, VF), VectorizedSet) | + vectorizeStoreChain(Chain.slice(VF), VectorizedSet); } DEBUG({ @@ -722,6 +757,10 @@ V->dump(); }); + // The chain is discarded for future vectorization regardless of the result + // below + VectorizedSet->append(Chain.begin(), Chain.end()); + // Check alignment restrictions. unsigned Alignment = getAlignment(S0); @@ -744,12 +783,6 @@ } } - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - - if (!isVectorizable(Chain, First, Last)) - return false; - // Set insert point. Builder.SetInsertPoint(&*Last); @@ -797,12 +830,13 @@ return true; } -bool Vectorizer::vectorizeLoadChain(ArrayRef Chain) { - LoadInst *L0 = cast(Chain[0]); +bool Vectorizer::vectorizeLoadChain(ArrayRef FullChain, + SmallVector *VectorizedSet) { + LoadInst *L0 = cast(FullChain[0]); // If the vector has an int element, default to int for the whole load. Type *LoadTy; - for (const auto &V : Chain) { + for (const auto &V : FullChain) { LoadTy = cast(V)->getType(); if (LoadTy->isIntOrIntVectorTy()) break; @@ -818,10 +852,31 @@ unsigned AS = L0->getPointerAddressSpace(); unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; - unsigned ChainSize = Chain.size(); + unsigned ChainSize = FullChain.size(); - if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) + if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) { + VectorizedSet->append(FullChain.begin(), FullChain.end()); return false; + } + + // Get enclosing instructions in BB + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(FullChain); + // Get the largest vectorizable prefix of the chain + unsigned StopChain = isVectorizable(FullChain, First, Last); + if (StopChain < 2) { + // There exists a side effect instruction, no vectorization possible + if (!StopChain) + VectorizedSet->append(FullChain.begin(), FullChain.end()); + // Failed after the first instruction, discard it and retry the smaller + // chain + else + VectorizedSet->append(FullChain.begin(), FullChain.begin() + 1); + return false; + } + + ArrayRef Chain = FullChain.slice(0, StopChain); + ChainSize = Chain.size(); // Load size should be 1B, 2B or multiple of 4B. // TODO: Should size constraint be a target hook? @@ -830,9 +885,10 @@ DEBUG(dbgs() << "LSV: Size should be 1B, 2B " "or multiple of 4B. Splitting.\n"); if (SzInBytes == 3) - return vectorizeLoadChain(Chain.slice(0, ChainSize - 1)); + return vectorizeLoadChain(Chain.slice(0, ChainSize - 1), VectorizedSet); auto Chains = splitOddVectorElts(Chain, Sz); - return vectorizeLoadChain(Chains.first) | vectorizeLoadChain(Chains.second); + return vectorizeLoadChain(Chains.first, VectorizedSet) | + vectorizeLoadChain(Chains.second, VectorizedSet); } VectorType *VecTy; @@ -848,10 +904,14 @@ if (ChainSize > VF) { DEBUG(dbgs() << "LSV: Vector factor is too big. " "Creating two separate arrays.\n"); - return vectorizeLoadChain(Chain.slice(0, VF)) | - vectorizeLoadChain(Chain.slice(VF)); + return vectorizeLoadChain(Chain.slice(0, VF), VectorizedSet) | + vectorizeLoadChain(Chain.slice(VF), VectorizedSet); } + // The chain is discarded for future vectorization regardless of the result + // below + VectorizedSet->append(Chain.begin(), Chain.end()); + // Check alignment restrictions. unsigned Alignment = getAlignment(L0); @@ -880,12 +940,6 @@ V->dump(); }); - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - - if (!isVectorizable(Chain, First, Last)) - return false; - // Set insert point. Builder.SetInsertPoint(&*First); Index: test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll +++ test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -48,8 +48,7 @@ ; CHECK-LABEL: @chain_suffix( ; CHECK: load i32 ; CHECK: store <2 x i32> -; CHECK: load i32 -; CHECK: load i32 +; CHECK: load <2 x i32> define void @chain_suffix(i32* noalias %ptr) { %next.gep = getelementptr i32, i32* %ptr, i64 0 %next.gep1 = getelementptr i32, i32* %ptr, i64 1 @@ -66,12 +65,9 @@ ; CHECK-LABEL: @chain_prefix_suffix( -; CHECK: load i32 -; CHECK: load i32 +; CHECK: load <2 x i32> ; CHECK: store <2 x i32> -; CHECK: load i32 -; CHECK: load i32 -; CHECK: load i32 +; CHECK: load <3 x i32> define void @chain_prefix_suffix(i32* noalias %ptr) { %next.gep = getelementptr i32, i32* %ptr, i64 0 %next.gep1 = getelementptr i32, i32* %ptr, i64 1