Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -90,7 +90,9 @@
 
   /// Reorders the users of I after vectorization to ensure that I dominates its
   /// users.
-  void reorder(Instruction *I);
+  void reorderBefore(Instruction *I);
+  void reorderBeforeHelper(Instruction *I,
+                           SmallPtrSet<Instruction *, 16> &InstructionsToMove);
 
   /// Returns the first and the last instructions in Chain.
   std::pair<BasicBlock::iterator, BasicBlock::iterator>
@@ -333,22 +335,50 @@
   return X2 == OffsetSCEVB;
 }
 
-void Vectorizer::reorder(Instruction *I) {
-  Instruction *InsertAfter = I;
-  for (User *U : I->users()) {
-    Instruction *User = dyn_cast<Instruction>(U);
-    if (!User || User->getOpcode() == Instruction::PHI)
-      continue;
+void Vectorizer::reorderBeforeHelper(
+    Instruction *I, SmallPtrSet<Instruction *, 16> &InstructionsToMove) {
+
+  SmallVector<Instruction *, 16> Worklist;
+  Worklist.insert(Worklist.end(), I);
+  while (Worklist.size()) {
+    auto LastElement = Worklist.end();
+    --LastElement;
+    I = *LastElement;
+    Worklist.erase(LastElement);
+    int NumOperands = I->getNumOperands();
+    for (int i = 0; i < NumOperands; i++) {
+      Instruction *IM = dyn_cast<Instruction>(I->getOperand(i));
+      if (!IM || IM->getOpcode() == Instruction::PHI)
+        continue;
 
-    if (!DT.dominates(I, User)) {
-      User->removeFromParent();
-      User->insertAfter(InsertAfter);
-      InsertAfter = User;
-      reorder(User);
+      if (!DT.dominates(IM, I)) {
+        InstructionsToMove.insert(IM);
+        Worklist.insert(Worklist.end(), IM);
+      }
     }
   }
 }
 
+void Vectorizer::reorderBefore(Instruction *I) {
+  SmallPtrSet<Instruction *, 16> InstructionsToMove;
+  reorderBeforeHelper(I, InstructionsToMove);
+
+  for (auto IM = I->getParent()->begin(), E = I->getParent()->end(); IM != E;
+       ++IM) {
+    if (!is_contained(InstructionsToMove, &*IM))
+      continue;
+    Instruction *InstructionToMove = &*IM;
+    --IM;
+    InstructionToMove->removeFromParent();
+    InstructionToMove->insertBefore(I);
+    InstructionsToMove.erase(InstructionToMove);
+  }
+  // This assert can fire if the helper marked instructions outside this BB
+  // for reordering.
+  assert(InstructionsToMove.size() == 0 &&
+         "Failed to reorder all instructions");
+}
+
 std::pair<BasicBlock::iterator, BasicBlock::iterator>
 Vectorizer::getBoundaryInstrs(ArrayRef<Value *> Chain) {
   Instruction *C0 = cast<Instruction>(Chain[0]);
@@ -857,7 +887,7 @@
     return false;
 
   // Set insert point.
-  Builder.SetInsertPoint(&*Last);
+  Builder.SetInsertPoint(&*First);
 
   Value *Bitcast =
       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
@@ -869,6 +899,7 @@
   if (VecLoadTy) {
     SmallVector<Instruction *, 16> InstrsToErase;
     SmallVector<Instruction *, 16> InstrsToReorder;
+    InstrsToReorder.push_back(cast<Instruction>(Bitcast));
 
     unsigned VecWidth = VecLoadTy->getNumElements();
     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
@@ -884,18 +915,18 @@
 
         // Replace the old instruction.
         UI->replaceAllUsesWith(Extracted);
-        InstrsToReorder.push_back(Extracted);
         InstrsToErase.push_back(UI);
       }
     }
 
     for (Instruction *ModUser : InstrsToReorder)
-      reorder(ModUser);
+      reorderBefore(ModUser);
 
     for (auto I : InstrsToErase)
      I->eraseFromParent();
   } else {
     SmallVector<Instruction *, 16> InstrsToReorder;
+    InstrsToReorder.push_back(cast<Instruction>(Bitcast));
 
     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
       Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I));
@@ -908,11 +939,10 @@
 
      // Replace the old instruction.
      UI->replaceAllUsesWith(Extracted);
-      InstrsToReorder.push_back(Extracted);
     }
 
     for (Instruction *ModUser : InstrsToReorder)
-      reorder(ModUser);
+      reorderBefore(ModUser);
   }
 
   eraseInstructions(Chain);
@@ -930,4 +960,3 @@
                                                    Alignment, &Fast);
   return Allows && Fast;
 }
-
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -4,11 +4,12 @@
 
 ; Check relative position of the inserted vector load relative to the
 ; existing adds.
+; Vectorized loads should be inserted at the position of the first load.
 
 ; CHECK-LABEL: @insert_load_point(
 ; CHECK: %z = add i32 %x, 4
-; CHECK: %w = add i32 %y, 9
 ; CHECK: load <2 x float>
+; CHECK: %w = add i32 %y, 9
 ; CHECK: %foo = add i32 %z, %w
 define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
Index: test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @correct_order(
+; CHECK: bitcast i32*
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: bitcast i32*
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @correct_order(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
Index: test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -6,6 +6,9 @@
 
 ; Check an i32 and i8* get vectorized, and that
 ; the two accesses (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered, preserving their relative order as much as possible.
 
 ; CHECK-LABEL: @preserve_order_32(
 ; CHECK: load <2 x i32>
Index: test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -6,6 +6,9 @@
 
 ; Check an i64 and i8* get vectorized, and that
 ; the two accesses (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions which were between the first and last load should be
+; reordered, preserving their relative order as much as possible.
 
 ; CHECK-LABEL: @preserve_order_64(
 ; CHECK: load <2 x i64>
@@ -22,4 +25,33 @@
   ret void
 }
 
+; Check that no vectorization occurs across a phi node.
+
+; CHECK-LABEL: @no_vect_phi(
+; CHECK: load i8*
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: load i64
+define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1, align 8
+  %buff.val = load i8, i8* %buff.p, align 8
+  store i8 0, i8* %buff.p, align 8
+  br label %"for something"
+
+"for something":
+  %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
+
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 8
+
+  %index.next = add i64 %index, 8
+  %cmp_res = icmp eq i64 %index.next, 8
+  br i1 %cmp_res, label %ending, label %"for something"
+
+ending:
+  ret void
+}
+
 attributes #0 = { nounwind }
Index: test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -0,0 +1,91 @@
+; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Check that subsets of the load/store chains get vectorized in the presence
+; of interleaved loads/stores.
+
+; CHECK-LABEL: @interleave_2L_2S(
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @interleave_2L_2S(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @interleave_3L_2S_1L(
+; CHECK: load <3 x i32>
+; CHECK: store <2 x i32>
+; CHECK: load i32
+
+define void @interleave_3L_2S_1L(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  %l1 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @chain_suffix(
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+define void @chain_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+
+; CHECK-LABEL: @chain_prefix_suffix(
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+define void @chain_prefix_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+  %next.gep3 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %next.gep, align 4
+  %l2 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep2, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+  %l5 = load i32, i32* %next.gep3, align 4
+
+  ret void
+}
+
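
For reviewers, a minimal standalone sketch (not part of the patch) of the reordering scheme that reorderBefore/reorderBeforeHelper implement: walk the operand chains of the instruction at the insertion point, mark every def that does not yet dominate its user, then move the marked instructions in front of the insertion point in their original block order. The helper name moveDefsBefore, the includes, and the inline sizes of 16 are illustrative assumptions, not taken from the patch.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Ensure every def feeding I ends up before I, preserving the defs'
// relative order within the block.
static void moveDefsBefore(Instruction *I, DominatorTree &DT) {
  SmallPtrSet<Instruction *, 16> ToMove;
  SmallVector<Instruction *, 16> Worklist;
  Worklist.push_back(I);

  // Mark every non-PHI instruction in I's operand chains that does not
  // already dominate its user (corresponds to reorderBeforeHelper).
  while (!Worklist.empty()) {
    Instruction *W = Worklist.pop_back_val();
    for (Value *Op : W->operands()) {
      auto *OpI = dyn_cast<Instruction>(Op);
      if (!OpI || isa<PHINode>(OpI) || DT.dominates(OpI, W))
        continue;
      if (ToMove.insert(OpI).second)
        Worklist.push_back(OpI);
    }
  }

  // Scan the block top-down and move each marked instruction in front of I;
  // the top-down scan preserves the marked instructions' relative order
  // (corresponds to reorderBefore). Marked instructions outside I's block
  // are not handled, matching the assert in the patch.
  BasicBlock *BB = I->getParent();
  for (auto It = BB->begin(); It != BB->end();) {
    Instruction *Cur = &*It++;
    if (ToMove.erase(Cur))
      Cur->moveBefore(I);
  }
}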