Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -32,6 +32,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include + using namespace llvm; #define DEBUG_TYPE "load-store-vectorizer" @@ -91,7 +93,10 @@ /// Reorders the users of I after vectorization to ensure that I dominates its /// users. - void reorder(Instruction *I); + void reorderAfter(Instruction *I); + void reorderAfterHelper(Instruction* I, std::unordered_set& InstructionsToMove); + void reorderBefore(Instruction *I); + void reorderBeforeHelper(Instruction* I, std::unordered_set& InstructionsToMove); /// Returns the first and the last instructions in Chain. std::pair @@ -334,22 +339,69 @@ return X2 == OffsetSCEVB; } -void Vectorizer::reorder(Instruction *I) { - Instruction *InsertAfter = I; +void Vectorizer::reorderAfterHelper(Instruction* I, + std::unordered_set &InstructionsToMove) { for (User *U : I->users()) { - Instruction *User = dyn_cast(U); - if (!User || User->getOpcode() == Instruction::PHI) + Instruction *IM = dyn_cast(U); + if (!IM || IM->getOpcode() == Instruction::PHI) + continue; + + if (!DT.dominates(I, IM)) { + InstructionsToMove.insert(IM); + reorderAfterHelper(IM, InstructionsToMove); + } + } +} + +void Vectorizer::reorderAfter(Instruction *I) { + std::unordered_set InstructionsToMove; + reorderAfterHelper(I, InstructionsToMove); + + Instruction* InsertAfter=I; + for (auto IM = I->getParent()->begin(), E = I->getParent()->end(); IM != E; ++IM) { + if (!is_contained(InstructionsToMove, &*IM)) + continue; + Instruction* InstructionToMove = &*IM; + --IM; + InstructionToMove->removeFromParent(); + InstructionToMove->insertAfter(InsertAfter); + InstructionsToMove.erase(&*InstructionToMove); + InsertAfter=InstructionToMove; + } +} + +void Vectorizer::reorderBeforeHelper(Instruction* I, + std::unordered_set &InstructionsToMove) { + + int NumOperands = I->getNumOperands(); + for (int i = 0; i < NumOperands; i++) { + Instruction *IM = dyn_cast(I->getOperand(i)); + if (!IM || IM->getOpcode() == Instruction::PHI) continue; - if (!DT.dominates(I, User)) { - User->removeFromParent(); - User->insertAfter(InsertAfter); - InsertAfter = User; - reorder(User); + if (!DT.dominates(IM, I)) { + InstructionsToMove.insert(IM); + reorderBeforeHelper(IM, InstructionsToMove); } } } +void Vectorizer::reorderBefore(Instruction *I) { + std::unordered_set InstructionsToMove; + reorderBeforeHelper(I, InstructionsToMove); + + for (auto IM = I->getParent()->begin(), E = I->getParent()->end(); IM != E; ++IM) { + if (!is_contained(InstructionsToMove, &*IM)) + continue; + Instruction* InstructionToMove = &*IM; + --IM; + InstructionToMove->removeFromParent(); + InstructionToMove->insertBefore(I); + InstructionsToMove.erase(&*InstructionToMove); + } +} + + std::pair Vectorizer::getBoundaryInstrs(ArrayRef Chain) { Instruction *C0 = cast(Chain[0]); @@ -861,7 +913,7 @@ return false; // Set insert point. - Builder.SetInsertPoint(&*Last); + Builder.SetInsertPoint(&*First); Value *Bitcast = Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); @@ -873,6 +925,7 @@ if (VecLoadTy) { SmallVector InstrsToErase; SmallVector InstrsToReorder; + InstrsToReorder.push_back(dyn_cast(Bitcast)); unsigned VecWidth = VecLoadTy->getNumElements(); for (unsigned I = 0, E = Chain.size(); I != E; ++I) { @@ -888,18 +941,18 @@ // Replace the old instruction. UI->replaceAllUsesWith(Extracted); - InstrsToReorder.push_back(Extracted); InstrsToErase.push_back(UI); } } for (Instruction *ModUser : InstrsToReorder) - reorder(ModUser); + reorderBefore(ModUser); for (auto I : InstrsToErase) I->eraseFromParent(); } else { SmallVector InstrsToReorder; + InstrsToReorder.push_back(dyn_cast(Bitcast)); for (unsigned I = 0, E = Chain.size(); I != E; ++I) { Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I)); @@ -912,11 +965,10 @@ // Replace the old instruction. UI->replaceAllUsesWith(Extracted); - InstrsToReorder.push_back(Extracted); } for (Instruction *ModUser : InstrsToReorder) - reorder(ModUser); + reorderBefore(ModUser); } eraseInstructions(Chain); Index: test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -7,8 +7,8 @@ ; CHECK-LABEL: @insert_load_point( ; CHECK: %z = add i32 %x, 4 -; CHECK: %w = add i32 %y, 9 ; CHECK: load <2 x float> +; CHECK: %w = add i32 %y, 9 ; CHECK: %foo = add i32 %z, %w define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: Index: test/Transforms/LoadStoreVectorizer/X86/correct-order.ll =================================================================== --- /dev/null +++ test/Transforms/LoadStoreVectorizer/X86/correct-order.ll @@ -0,0 +1,36 @@ +; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; CHECK-LABEL: @correct_order( +; CHECK: bitcast i32* +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: bitcast i32* +; CHECK: store <2 x i32> +; CHECK: load i32 +define void @correct_order(i32* noalias %ptr) { +entry: + br label %"for something" + +"for something": + %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ] + %next.gep = getelementptr i32, i32* %ptr, i64 %index + %a1 = add nsw i64 %index, 1 + %next.gep1 = getelementptr i32, i32* %ptr, i64 %a1 + %a2 = add nsw i64 %index, 2 + %next.gep2 = getelementptr i32, i32* %ptr, i64 %a2 + + %l1 = load i32, i32* %next.gep1, align 4 + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + %index.next = add i64 %index, 8 + %cmp_res = icmp eq i64 %index.next, 8 + br i1 %cmp_res, label %ending, label %"for something" + +ending: + ret void +} Index: test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll =================================================================== --- /dev/null +++ test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -0,0 +1,140 @@ +; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Vectorized subsets of the load/store chains in the presence of +; interleaved loads/stores + +; CHECK-LABEL: @interleave_2L_2S( +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load i32 +define void @interleave_2L_2S(i32* noalias %ptr) { +entry: + br label %"for something" + +"for something": + %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ] + %next.gep = getelementptr i32, i32* %ptr, i64 %index + %a1 = add nsw i64 %index, 1 + %next.gep1 = getelementptr i32, i32* %ptr, i64 %a1 + %a2 = add nsw i64 %index, 2 + %next.gep2 = getelementptr i32, i32* %ptr, i64 %a2 + + %l1 = load i32, i32* %next.gep1, align 4 + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + %index.next = add i64 %index, 8 + %cmp_res = icmp eq i64 %index.next, 8 + br i1 %cmp_res, label %ending, label %"for something" + +ending: + ret void +} + +; CHECK-LABEL: @interleave_3L_2S_1L( +; CHECK: load <3 x i32> +; CHECK: store <2 x i32> +; CHECK: load i32 + +define void @interleave_3L_2S_1L(i32* noalias %ptr) { +entry: + br label %"for something" + +"for something": + %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ] + %next.gep = getelementptr i32, i32* %ptr, i64 %index + %a1 = add nsw i64 %index, 1 + %next.gep1 = getelementptr i32, i32* %ptr, i64 %a1 + %a2 = add nsw i64 %index, 2 + %next.gep2 = getelementptr i32, i32* %ptr, i64 %a2 + + %l2 = load i32, i32* %next.gep, align 4 + %l1 = load i32, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + %index.next = add i64 %index, 8 + %cmp_res = icmp eq i64 %index.next, 8 + br i1 %cmp_res, label %ending, label %"for something" + +ending: + ret void +} + +; CHECK-LABEL: @chain_suffix( +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load i32 +; CHECK: load i32 +define void @chain_suffix(i32* noalias %ptr) { +entry: + br label %"for something" + +"for something": + %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ] + %next.gep = getelementptr i32, i32* %ptr, i64 %index + %a1 = add nsw i64 %index, 1 + %next.gep1 = getelementptr i32, i32* %ptr, i64 %a1 + %a2 = add nsw i64 %index, 2 + %next.gep2 = getelementptr i32, i32* %ptr, i64 %a2 + + %l2 = load i32, i32* %next.gep, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + + %index.next = add i64 %index, 8 + %cmp_res = icmp eq i64 %index.next, 8 + br i1 %cmp_res, label %ending, label %"for something" + +ending: + ret void +} + + +; CHECK-LABEL: @chain_prefix_suffix( +; CHECK: load i32 +; CHECK: load i32 +; CHECK: store <2 x i32> +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 +define void @chain_prefix_suffix(i32* noalias %ptr) { +entry: + br label %"for something" + +"for something": + %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ] + %next.gep = getelementptr i32, i32* %ptr, i64 %index + %a1 = add nsw i64 %index, 1 + %next.gep1 = getelementptr i32, i32* %ptr, i64 %a1 + %a2 = add nsw i64 %index, 2 + %next.gep2 = getelementptr i32, i32* %ptr, i64 %a2 + %a3 = add nsw i64 %index, 3 + %next.gep3 = getelementptr i32, i32* %ptr, i64 %a3 + + %l1 = load i32, i32* %next.gep, align 4 + %l2 = load i32, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep1, align 4 + store i32 0, i32* %next.gep2, align 4 + %l3 = load i32, i32* %next.gep1, align 4 + %l4 = load i32, i32* %next.gep2, align 4 + %l5 = load i32, i32* %next.gep3, align 4 + + %index.next = add i64 %index, 8 + %cmp_res = icmp eq i64 %index.next, 8 + br i1 %cmp_res, label %ending, label %"for something" + +ending: + ret void +} +