Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -104,13 +104,12 @@ std::pair, ArrayRef> splitOddVectorElts(ArrayRef Chain, unsigned ElementSizeBits); - /// Checks for instructions which may affect the memory accessed - /// in the chain between \p From and \p To. Returns Index, where - /// \p Chain[0, Index) is the largest vectorizable chain prefix. - /// The elements of \p Chain should be all loads or all stores. - unsigned getVectorizablePrefixEndIdx(ArrayRef Chain, - BasicBlock::iterator From, - BasicBlock::iterator To); + /// Finds the largest prefix of Chain that's vectorizable, checking for + /// intervening instructions which may affect the memory accessed by the + /// instructions within Chain. + /// + /// The elements of \p Chain must be all loads or all stores. + ArrayRef getVectorizablePrefix(ArrayRef Chain); /// Collects load and store instructions to vectorize. std::pair collectInstructions(BasicBlock *BB); @@ -424,14 +423,12 @@ return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); } -unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef Chain, - BasicBlock::iterator From, - BasicBlock::iterator To) { +ArrayRef Vectorizer::getVectorizablePrefix(ArrayRef Chain) { SmallVector, 16> MemoryInstrs; SmallVector, 16> ChainInstrs; unsigned InstrIdx = 0; - for (Instruction &I : make_range(From, To)) { + for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { ++InstrIdx; if (isa(I) || isa(I)) { if (!is_contained(Chain, &I)) @@ -445,7 +442,7 @@ } assert(Chain.size() == ChainInstrs.size() && - "All instructions in the Chain must exist in [From, To)."); + "All instrs in Chain must be within range getBoundaryInstrs(Chain)."); unsigned ChainIdx = 0; for (auto EntryChain : ChainInstrs) { @@ -487,12 +484,12 @@ << *Ptr1 << '\n'; }); - return ChainIdx; + return Chain.slice(0, ChainIdx); } } ChainIdx++; } - return Chain.size(); + return Chain; } std::pair @@ -694,22 +691,20 @@ return false; } - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last); - if (StopChain == 0) { + ArrayRef NewChain = getVectorizablePrefix(Chain); + if (NewChain.empty()) { // There exists a side effect instruction, no vectorization possible. InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - if (StopChain == 1) { + if (NewChain.size() == 1) { // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(Chain.front()); + InstructionsProcessed->insert(NewChain.front()); return false; } // Update Chain to the valid vectorizable subchain. - Chain = Chain.slice(0, StopChain); + Chain = NewChain; ChainSize = Chain.size(); // Store size should be 1B, 2B or multiple of 4B. @@ -773,7 +768,8 @@ } } - // Set insert point. + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(Chain); Builder.SetInsertPoint(&*Last); Value *Vec = UndefValue::get(VecTy); @@ -849,22 +845,20 @@ return false; } - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last); - if (StopChain == 0) { + ArrayRef NewChain = getVectorizablePrefix(Chain); + if (NewChain.empty()) { // There exists a side effect instruction, no vectorization possible. InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - if (StopChain == 1) { + if (NewChain.size() == 1) { // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(Chain.front()); + InstructionsProcessed->insert(NewChain.front()); return false; } // Update Chain to the valid vectorizable subchain. - Chain = Chain.slice(0, StopChain); + Chain = NewChain; ChainSize = Chain.size(); // Load size should be 1B, 2B or multiple of 4B. @@ -927,7 +921,8 @@ V->dump(); }); - // Set insert point. + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(Chain); Builder.SetInsertPoint(&*First); Value *Bitcast = Index: test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -2,8 +2,9 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" -; Check relative position of the inserted vector load relative to the existing -; adds. Vectorized loads should be inserted at the position of the first load. +; Check position of the inserted vector load/store. Vectorized loads should be +; inserted at the position of the first load in the chain, and stores should be +; inserted at the position of the last store. ; CHECK-LABEL: @insert_load_point( ; CHECK: %z = add i32 %x, 4 @@ -59,4 +60,30 @@ ret void } +; Here we have four stores, with an aliasing load before the last one. We can +; vectorize the first two stores as <2 x float>, but this vectorized store must +; be inserted at the location of the second scalar store, not the fourth one. +; +; CHECK-LABEL: @insert_store_point_alias +; CHECK: store <2 x float> +; CHECK: store float +; CHECK-SAME: %a.idx.2 +; CHECK: load float, float addrspace(1)* %a.idx.2 +; CHECK: store float +; CHECK-SAME: %a.idx.3 +define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) { + %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx + %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1 + %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1 + %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1 + + store float 0.0, float addrspace(1)* %a.idx, align 4 + store float 0.0, float addrspace(1)* %a.idx.1, align 4 + store float 0.0, float addrspace(1)* %a.idx.2, align 4 + %x = load float, float addrspace(1)* %a.idx.2, align 4 + store float 0.0, float addrspace(1)* %a.idx.3, align 4 + + ret float %x +} + attributes #0 = { nounwind }