Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -9,7 +9,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" @@ -31,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Vectorize.h" using namespace llvm; @@ -60,9 +60,8 @@ public: Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI) - : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI), - DL(F.getParent()->getDataLayout()), - Builder(SE.getContext()) {} + : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI), + DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {} bool run(); @@ -91,7 +90,9 @@ /// Reorders the users of I after vectorization to ensure that I dominates its /// users. - void reorder(Instruction *I); + void reorderBefore(Instruction *I); + void reorderBeforeHelper(Instruction *I, + SmallPtrSet &InstructionsToMove); /// Returns the first and the last instructions in Chain. std::pair @@ -182,8 +183,8 @@ AliasAnalysis &AA = getAnalysis().getAAResults(); DominatorTree &DT = getAnalysis().getDomTree(); ScalarEvolution &SE = getAnalysis().getSE(); - TargetTransformInfo &TTI - = getAnalysis().getTTI(F); + TargetTransformInfo &TTI = + getAnalysis().getTTI(F); Vectorizer V(F, AA, DT, SE, TTI); return V.run(); @@ -237,7 +238,7 @@ if (PtrA == PtrB || DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) || DL.getTypeStoreSize(PtrATy->getScalarType()) != - DL.getTypeStoreSize(PtrBTy->getScalarType())) + DL.getTypeStoreSize(PtrBTy->getScalarType())) return false; APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy)); @@ -334,22 +335,50 @@ return X2 == OffsetSCEVB; } -void Vectorizer::reorder(Instruction *I) { - Instruction *InsertAfter = I; - for (User *U : I->users()) { - Instruction *User = dyn_cast(U); - if (!User || User->getOpcode() == Instruction::PHI) - continue; +void Vectorizer::reorderBeforeHelper( + Instruction *I, SmallPtrSet &InstructionsToMove) { + + SmallVector Worklist; + Worklist.insert(Worklist.end(), I); + while (Worklist.size()) { + auto LastElement = Worklist.end(); + --LastElement; + I = *LastElement; + Worklist.erase(LastElement); + int NumOperands = I->getNumOperands(); + for (int i = 0; i < NumOperands; i++) { + Instruction *IM = dyn_cast(I->getOperand(i)); + if (!IM || IM->getOpcode() == Instruction::PHI) + continue; - if (!DT.dominates(I, User)) { - User->removeFromParent(); - User->insertAfter(InsertAfter); - InsertAfter = User; - reorder(User); + if (!DT.dominates(IM, I)) { + InstructionsToMove.insert(IM); + Worklist.insert(Worklist.end(), IM); + } } } } +void Vectorizer::reorderBefore(Instruction *I) { + SmallPtrSet InstructionsToMove; + reorderBeforeHelper(I, InstructionsToMove); + + for (auto IM = I->getParent()->begin(), E = I->getParent()->end(); IM != E; + ++IM) { + if (!is_contained(InstructionsToMove, &*IM)) + continue; + Instruction *InstructionToMove = &*IM; + --IM; + InstructionToMove->removeFromParent(); + InstructionToMove->insertBefore(I); + InstructionsToMove.erase(InstructionToMove); + } + // This failure can occur if the helper marks for reordering instructions + // outside this BB + assert(InstructionsToMove.size() == 0 && + "Failed to reorder all instructions"); +} + std::pair Vectorizer::getBoundaryInstrs(ArrayRef Chain) { Instruction *C0 = cast(Chain[0]); @@ -414,9 +443,9 @@ for (auto I = From, E = To; I != E; ++I, ++Idx) { if (isa(I) || isa(I)) { if (!is_contained(Chain, &*I)) - MemoryInstrs.push_back({ &*I, Idx }); + MemoryInstrs.push_back({&*I, Idx}); else - ChainInstrs.push_back({ &*I, Idx }); + ChainInstrs.push_back({&*I, Idx}); } else if (I->mayHaveSideEffects()) { DEBUG(dbgs() << "LSV: Found side-effecting operation: " << *I << '\n'); return false; @@ -450,16 +479,14 @@ Instruction *M1 = cast(VV); if (!AA.isNoAlias(MemoryLocation::get(M0), MemoryLocation::get(M1))) { - DEBUG( - Value *Ptr0 = getPointerOperand(M0); - Value *Ptr1 = getPointerOperand(M1); + DEBUG(Value *Ptr0 = getPointerOperand(M0); + Value *Ptr1 = getPointerOperand(M1); - dbgs() << "LSV: Found alias.\n" - " Aliasing instruction and pointer:\n" - << *V << " aliases " << *Ptr0 << '\n' - << " Aliased instruction and pointer:\n" - << *VV << " aliases " << *Ptr1 << '\n' - ); + dbgs() << "LSV: Found alias.\n" + " Aliasing instruction and pointer:\n" + << *V << " aliases " << *Ptr0 << '\n' + << " Aliased instruction and pointer:\n" + << *VV << " aliases " << *Ptr1 << '\n'); return false; } @@ -500,8 +527,7 @@ continue; // Make sure all the users of a vector are constant-index extracts. - if (isa(Ty) && - !all_of(LI->users(), [LI](const User *U) { + if (isa(Ty) && !all_of(LI->users(), [LI](const User *U) { const Instruction *UI = cast(U); return isa(UI) && isa(UI->getOperand(1)); @@ -534,8 +560,7 @@ if (TySize > VecRegSize / 2) continue; - if (isa(Ty) && - !all_of(SI->users(), [SI](const User *U) { + if (isa(Ty) && !all_of(SI->users(), [SI](const User *U) { const Instruction *UI = cast(U); return isa(UI) && isa(UI->getOperand(1)); @@ -689,11 +714,8 @@ vectorizeStoreChain(Chain.slice(VF)); } - DEBUG( - dbgs() << "LSV: Stores to vectorize:\n"; - for (Value *V : Chain) - V->dump(); - ); + DEBUG(dbgs() << "LSV: Stores to vectorize:\n"; for (Value *V + : Chain) V->dump();); // Check alignment restrictions. unsigned Alignment = getAlignment(S0); @@ -740,8 +762,8 @@ if (Extract->getType() != StoreTy->getScalarType()) Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType()); - Value *Insert = Builder.CreateInsertElement(Vec, Extract, - Builder.getInt32(NewIdx)); + Value *Insert = + Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx)); Vec = Insert; } } @@ -750,16 +772,17 @@ StoreInst *Store = cast(Chain[I]); Value *Extract = Store->getValueOperand(); if (Extract->getType() != StoreTy->getScalarType()) - Extract = Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); + Extract = + Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType()); - Value *Insert = Builder.CreateInsertElement(Vec, Extract, - Builder.getInt32(I)); + Value *Insert = + Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I)); Vec = Insert; } } Value *Bitcast = - Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)); + Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)); StoreInst *SI = cast(Builder.CreateStore(Vec, Bitcast)); propagateMetadata(SI, Chain); SI->setAlignment(Alignment); @@ -785,7 +808,6 @@ DL.getTypeSizeInBits(LoadTy)); break; } - } unsigned Sz = DL.getTypeSizeInBits(LoadTy); @@ -801,7 +823,8 @@ // TODO: Should size constraint be a target hook? unsigned SzInBytes = (Sz / 8) * ChainSize; if (SzInBytes > 2 && SzInBytes % 4 != 0) { - DEBUG(dbgs() << "LSV: Size should be 1B, 2B or multiple of 4B. Splitting.\n"); + DEBUG( + dbgs() << "LSV: Size should be 1B, 2B or multiple of 4B. Splitting.\n"); if (SzInBytes == 3) return vectorizeLoadChain(Chain.slice(0, ChainSize - 1)); auto Chains = splitOddVectorElts(Chain, Sz); @@ -848,11 +871,8 @@ } } - DEBUG( - dbgs() << "LSV: Loads to vectorize:\n"; - for (Value *V : Chain) - V->dump(); - ); + DEBUG(dbgs() << "LSV: Loads to vectorize:\n"; for (Value *V + : Chain) V->dump();); BasicBlock::iterator First, Last; std::tie(First, Last) = getBoundaryInstrs(Chain); @@ -861,10 +881,10 @@ return false; // Set insert point. - Builder.SetInsertPoint(&*Last); + Builder.SetInsertPoint(&*First); Value *Bitcast = - Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); + Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); LoadInst *LI = cast(Builder.CreateLoad(Bitcast)); propagateMetadata(LI, Chain); @@ -873,6 +893,7 @@ if (VecLoadTy) { SmallVector InstrsToErase; SmallVector InstrsToReorder; + InstrsToReorder.push_back(cast(Bitcast)); unsigned VecWidth = VecLoadTy->getNumElements(); for (unsigned I = 0, E = Chain.size(); I != E; ++I) { @@ -883,40 +904,39 @@ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx)); Instruction *Extracted = cast(V); if (Extracted->getType() != UI->getType()) - Extracted = - cast(Builder.CreateBitCast(Extracted, UI->getType())); + Extracted = cast( + Builder.CreateBitCast(Extracted, UI->getType())); // Replace the old instruction. UI->replaceAllUsesWith(Extracted); - InstrsToReorder.push_back(Extracted); InstrsToErase.push_back(UI); } } for (Instruction *ModUser : InstrsToReorder) - reorder(ModUser); + reorderBefore(ModUser); for (auto I : InstrsToErase) I->eraseFromParent(); } else { SmallVector InstrsToReorder; + InstrsToReorder.push_back(cast(Bitcast)); for (unsigned I = 0, E = Chain.size(); I != E; ++I) { Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I)); Instruction *Extracted = cast(V); Instruction *UI = cast(Chain[I]); if (Extracted->getType() != UI->getType()) { - Extracted = - cast(Builder.CreateBitOrPointerCast(Extracted, UI->getType())); + Extracted = cast( + Builder.CreateBitOrPointerCast(Extracted, UI->getType())); } // Replace the old instruction. UI->replaceAllUsesWith(Extracted); - InstrsToReorder.push_back(Extracted); } for (Instruction *ModUser : InstrsToReorder) - reorder(ModUser); + reorderBefore(ModUser); } eraseInstructions(Chain); @@ -928,7 +948,6 @@ bool Vectorizer::allowsMisaligned(unsigned SzInBytes, unsigned AddressSpace, unsigned Alignment, bool *Fast) { - return TTI.allowsMisalignedMemoryAccesses(SzInBytes*8, AddressSpace, - Alignment, Fast); + return TTI.allowsMisalignedMemoryAccesses(SzInBytes * 8, AddressSpace, + Alignment, Fast); } - Index: test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -4,11 +4,12 @@ ; Check relative position of the inserted vector load relative to the ; existing adds. +; Vectorized loads should be inserted at the position of the first load. ; CHECK-LABEL: @insert_load_point( ; CHECK: %z = add i32 %x, 4 -; CHECK: %w = add i32 %y, 9 ; CHECK: load <2 x float> +; CHECK: %w = add i32 %y, 9 ; CHECK: %foo = add i32 %z, %w define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: Index: test/Transforms/LoadStoreVectorizer/X86/correct-order.ll =================================================================== --- /dev/null +++ test/Transforms/LoadStoreVectorizer/X86/correct-order.ll @@ -0,0 +1,26 @@ +; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @correct_order( +; CHECK: bitcast i32* +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: bitcast i32* +; CHECK: store <2 x i32> +; CHECK: load i32 +define void @correct_order(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l1 = load i32, i32* %next.gep1, align 4 + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + Index: test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll +++ test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll @@ -6,6 +6,9 @@ ; Check an i32 and i8* get vectorized, and that ; the two accesses (load into buff.val and store to buff.p) preserve their order. +; Vectorized loads should be inserted at the position of the first load, +; and instructions which were between the first and last load should be +; reordered preserving their relative order inasmuch as possible. ; CHECK-LABEL: @preserve_order_32( ; CHECK: load <2 x i32> Index: test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll +++ test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll @@ -6,6 +6,9 @@ ; Check an i64 and i8* get vectorized, and that ; the two accesses (load into buff.val and store to buff.p) preserve their order. +; Vectorized loads should be inserted at the position of the first load, +; and instructions which were between the first and last load should be +; reordered preserving their relative order inasmuch as possible. ; CHECK-LABEL: @preserve_order_64( ; CHECK: load <2 x i64> @@ -22,4 +25,33 @@ ret void } +; Check for no vectorization over phi node + +; CHECK-LABEL: @no_vect_phi( +; CHECK: load i8* +; CHECK: load i8 +; CHECK: store i8 0 +; CHECK: load i64 +define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) { +entry: + %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1 + %buff.p = load i8*, i8** %tmp1, align 8 + %buff.val = load i8, i8* %buff.p, align 8 + store i8 0, i8* %buff.p, align 8 + br label %"for something" + +"for something": + %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ] + + %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0 + %buff.int = load i64, i64* %tmp0, align 8 + + %index.next = add i64 %index, 8 + %cmp_res = icmp eq i64 %index.next, 8 + br i1 %cmp_res, label %ending, label %"for something" + +ending: + ret void +} + attributes #0 = { nounwind } Index: test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll =================================================================== --- /dev/null +++ test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -0,0 +1,91 @@ +; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Vectorized subsets of the load/store chains in the presence of +; interleaved loads/stores + +; CHECK-LABEL: @interleave_2L_2S( +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load i32 +define void @interleave_2L_2S(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l1 = load i32, i32* %next.gep1, align 4 + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + +; CHECK-LABEL: @interleave_3L_2S_1L( +; CHECK: load <3 x i32> +; CHECK: store <2 x i32> +; CHECK: load i32 + +define void @interleave_3L_2S_1L(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l2 = load i32, i32* %next.gep, align 4 + %l1 = load i32, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + +; CHECK-LABEL: @chain_suffix( +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load i32 +; CHECK: load i32 +define void @chain_suffix(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + ret void +} + + +; CHECK-LABEL: @chain_prefix_suffix( +; CHECK: load i32 +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 +define void @chain_prefix_suffix(i32* noalias %ptr) { + %next.gep = getelementptr i32, i32* %ptr, i64 0 + %next.gep1 = getelementptr i32, i32* %ptr, i64 1 + %next.gep2 = getelementptr i32, i32* %ptr, i64 2 + %next.gep3 = getelementptr i32, i32* %ptr, i64 3 + + %l1 = load i32, i32* %next.gep, align 4 + %l2 = load i32, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep2, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + %l5 = load i32, i32* %next.gep3, align 4 + + ret void +} +