Index: llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -88,8 +88,8 @@
 
   bool isConsecutiveAccess(Value *A, Value *B);
 
-  /// Reorders the users of I after vectorization to ensure that I dominates its
-  /// users.
+  /// After vectorization, reorder the instructions that I depends on
+  /// (the instructions defining its operands), to ensure they dominate I.
   void reorder(Instruction *I);
 
   /// Returns the first and the last instructions in Chain.
@@ -334,19 +334,37 @@
 }
 
 void Vectorizer::reorder(Instruction *I) {
-  Instruction *InsertAfter = I;
-  for (User *U : I->users()) {
-    Instruction *User = dyn_cast<Instruction>(U);
-    if (!User || User->getOpcode() == Instruction::PHI)
-      continue;
+  SmallPtrSet<Instruction *, 16> InstructionsToMove;
+  SmallVector<Instruction *, 16> Worklist;
 
-    if (!DT.dominates(I, User)) {
-      User->removeFromParent();
-      User->insertAfter(InsertAfter);
-      InsertAfter = User;
-      reorder(User);
+  Worklist.push_back(I);
+  while (!Worklist.empty()) {
+    Instruction *IW = Worklist.pop_back_val();
+    int NumOperands = IW->getNumOperands();
+    for (int i = 0; i < NumOperands; i++) {
+      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+      if (!IM || IM->getOpcode() == Instruction::PHI)
+        continue;
+
+      if (!DT.dominates(IM, I)) {
+        InstructionsToMove.insert(IM);
+        Worklist.push_back(IM);
+        assert(IM->getParent() == IW->getParent() &&
+               "Instructions to move should be in the same basic block");
+      }
     }
   }
+
+  // All instructions to move should follow I. Start from I, not from begin().
+  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
+       ++BBI) {
+    if (!is_contained(InstructionsToMove, &*BBI))
+      continue;
+    Instruction *IM = &*BBI;
+    --BBI;
+    IM->removeFromParent();
+    IM->insertBefore(I);
+  }
 }
 
 std::pair<BasicBlock::iterator, BasicBlock::iterator>
@@ -851,7 +869,7 @@
     return false;
 
   // Set insert point.
-  Builder.SetInsertPoint(&*Last);
+  Builder.SetInsertPoint(&*First);
 
   Value *Bitcast = Builder.CreateBitCast(L0->getPointerOperand(),
                                          VecTy->getPointerTo(AS));
@@ -863,6 +881,7 @@
   if (VecLoadTy) {
     SmallVector<Instruction *, 16> InstrsToErase;
     SmallVector<Instruction *, 16> InstrsToReorder;
+    InstrsToReorder.push_back(cast<Instruction>(Bitcast));
 
     unsigned VecWidth = VecLoadTy->getNumElements();
     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
@@ -878,7 +897,6 @@
 
       // Replace the old instruction.
      UI->replaceAllUsesWith(Extracted);
-      InstrsToReorder.push_back(Extracted);
       InstrsToErase.push_back(UI);
     }
   }
@@ -890,6 +908,7 @@
       I->eraseFromParent();
   } else {
     SmallVector<Instruction *, 16> InstrsToReorder;
+    InstrsToReorder.push_back(cast<Instruction>(Bitcast));
 
     for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
       Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(I));
@@ -902,7 +921,6 @@
 
       // Replace the old instruction.
       UI->replaceAllUsesWith(Extracted);
-      InstrsToReorder.push_back(Extracted);
     }
 
     for (Instruction *ModUser : InstrsToReorder)
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -2,13 +2,13 @@
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
-; Check relative position of the inserted vector load relative to the
-; existing adds.
+; Check the position of the inserted vector load relative to the existing
+; adds. Vectorized loads should be inserted at the position of the first load.
 
 ; CHECK-LABEL: @insert_load_point(
 ; CHECK: %z = add i32 %x, 4
-; CHECK: %w = add i32 %y, 9
 ; CHECK: load <2 x float>
+; CHECK: %w = add i32 %y, 9
 ; CHECK: %foo = add i32 %z, %w
 define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/correct-order.ll
@@ -0,0 +1,26 @@
+; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @correct_order(
+; CHECK: bitcast i32*
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: bitcast i32*
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @correct_order(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order32.ll
@@ -4,8 +4,11 @@
 
 %struct.buffer_t = type { i32, i8* }
 
-; Check an i32 and i8* get vectorized, and that
-; the two accesses (load into buff.val and store to buff.p) preserve their order.
+; Check an i32 and i8* get vectorized, and that the two accesses
+; (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions that were between the first and last load should be
+; reordered preserving their relative order as much as possible.
 
 ; CHECK-LABEL: @preserve_order_32(
 ; CHECK: load <2 x i32>
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/preserve-order64.ll
@@ -3,9 +3,13 @@
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
 %struct.buffer_t = type { i64, i8* }
+%struct.nested.buffer = type { %struct.buffer_t, %struct.buffer_t }
 
-; Check an i64 and i8* get vectorized, and that
-; the two accesses (load into buff.val and store to buff.p) preserve their order.
+; Check an i64 and i8* get vectorized, and that the two accesses
+; (load into buff.val and store to buff.p) preserve their order.
+; Vectorized loads should be inserted at the position of the first load,
+; and instructions that were between the first and last load should be
+; reordered preserving their relative order as much as possible.
 
 ; CHECK-LABEL: @preserve_order_64(
 ; CHECK: load <2 x i64>
@@ -22,4 +26,52 @@
   ret void
 }
 
+; Check reordering recurses correctly.
+
+; CHECK-LABEL: @transitive_reorder(
+; CHECK: load <2 x i64>
+; CHECK: %buff.val = load i8
+; CHECK: store i8 0
+define void @transitive_reorder(%struct.buffer_t* noalias %buff, %struct.nested.buffer* noalias %nest) #0 {
+entry:
+  %nest0_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest0_0, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1, align 8
+  %buff.val = load i8, i8* %buff.p, align 8
+  store i8 0, i8* %buff.p, align 8
+  %nest1_0 = getelementptr inbounds %struct.nested.buffer, %struct.nested.buffer* %nest, i64 0, i32 0
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %nest1_0, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 8
+  ret void
+}
+
+; Check for no vectorization over phi node.
+
+; CHECK-LABEL: @no_vect_phi(
+; CHECK: load i8*
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: load i64
+define void @no_vect_phi(i32* noalias %ptr, %struct.buffer_t* noalias %buff) {
+entry:
+  %tmp1 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 1
+  %buff.p = load i8*, i8** %tmp1, align 8
+  %buff.val = load i8, i8* %buff.p, align 8
+  store i8 0, i8* %buff.p, align 8
+  br label %"for something"
+
+"for something":
+  %index = phi i64 [ 0, %entry ], [ %index.next, %"for something" ]
+
+  %tmp0 = getelementptr inbounds %struct.buffer_t, %struct.buffer_t* %buff, i64 0, i32 0
+  %buff.int = load i64, i64* %tmp0, align 8
+
+  %index.next = add i64 %index, 8
+  %cmp_res = icmp eq i64 %index.next, 8
+  br i1 %cmp_res, label %ending, label %"for something"
+
+ending:
+  ret void
+}
+
 attributes #0 = { nounwind }
Index: llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
===================================================================
--- llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
+++ llvm/trunk/test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll
@@ -0,0 +1,91 @@
+; RUN: opt -mtriple=x86-linux -load-store-vectorizer -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Vectorized subsets of the load/store chains in the presence of
+; interleaved loads/stores
+
+; CHECK-LABEL: @interleave_2L_2S(
+; CHECK: load <2 x i32>
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+define void @interleave_2L_2S(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l1 = load i32, i32* %next.gep1, align 4
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @interleave_3L_2S_1L(
+; CHECK: load <3 x i32>
+; CHECK: store <2 x i32>
+; CHECK: load i32
+
+define void @interleave_3L_2S_1L(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  %l1 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+; CHECK-LABEL: @chain_suffix(
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+define void @chain_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+
+  %l2 = load i32, i32* %next.gep, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+
+  ret void
+}
+
+
+; CHECK-LABEL: @chain_prefix_suffix(
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: store <2 x i32>
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+define void @chain_prefix_suffix(i32* noalias %ptr) {
+  %next.gep = getelementptr i32, i32* %ptr, i64 0
+  %next.gep1 = getelementptr i32, i32* %ptr, i64 1
+  %next.gep2 = getelementptr i32, i32* %ptr, i64 2
+  %next.gep3 = getelementptr i32, i32* %ptr, i64 3
+
+  %l1 = load i32, i32* %next.gep, align 4
+  %l2 = load i32, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep1, align 4
+  store i32 0, i32* %next.gep2, align 4
+  %l3 = load i32, i32* %next.gep1, align 4
+  %l4 = load i32, i32* %next.gep2, align 4
+  %l5 = load i32, i32* %next.gep3, align 4
+
+  ret void
+}
+
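Note on the new reorder() strategy in the patch above: instead of pushing users down past the vectorized instruction, the pass now walks the operands of the newly inserted instruction I, collects every defining instruction that does not already dominate I (i.e. that still appears after it in the block), and moves those definitions to just before I while keeping their relative order. The following is a minimal standalone C++ sketch of that idea, not the LLVM code itself; the Inst and Block types and the findDef() helper are hypothetical stand-ins, and PHI handling plus the same-block assertion are omitted. The example block mirrors the shape of the @correct_order test, where the vector load lands at the position of the first scalar load and the address computations it uses must be pulled up in front of it.

// reorder_sketch.cpp -- standalone illustration of the reordering strategy;
// Inst, Block and findDef() are invented stand-ins, not LLVM API.
#include <cstddef>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Inst {
  std::string Name;                  // value this instruction defines
  std::vector<std::string> Operands; // names of the values it uses
};

using Block = std::vector<Inst>;

// Returns the index of the instruction defining Name, or BB.size() if the
// value is not defined in this block (e.g. a function argument).
static std::size_t findDef(const Block &BB, const std::string &Name) {
  for (std::size_t K = 0; K < BB.size(); ++K)
    if (BB[K].Name == Name)
      return K;
  return BB.size();
}

// Transitively collect the operand definitions of BB[IPos] that currently
// appear after IPos, then move them to just before IPos, keeping their
// relative order.
static void reorderBefore(Block &BB, std::size_t IPos) {
  std::set<std::size_t> ToMove;
  std::vector<std::size_t> Worklist{IPos};
  while (!Worklist.empty()) {
    std::size_t W = Worklist.back();
    Worklist.pop_back();
    for (const std::string &Op : BB[W].Operands) {
      std::size_t Def = findDef(BB, Op);
      if (Def < BB.size() && Def > IPos && ToMove.insert(Def).second)
        Worklist.push_back(Def); // also visit the moved definition's operands
    }
  }

  Block NewBB(BB.begin(), BB.begin() + IPos); // everything already before I
  for (std::size_t K = IPos + 1; K < BB.size(); ++K)
    if (ToMove.count(K))
      NewBB.push_back(BB[K]); // moved definitions, in original relative order
  NewBB.push_back(BB[IPos]);  // I itself
  for (std::size_t K = IPos + 1; K < BB.size(); ++K)
    if (!ToMove.count(K))
      NewBB.push_back(BB[K]); // the rest also keeps its relative order
  BB = std::move(NewBB);
}

int main() {
  // The "vector load" sits at the position of the first scalar load and uses
  // address computations that still follow it in the block.
  Block BB = {{"vec.load", {"gep0", "gep1"}},
              {"gep0", {"base"}},
              {"gep1", {"base"}},
              {"store0", {"gep0"}}};
  reorderBefore(BB, 0);
  for (const Inst &I : BB)
    std::cout << I.Name << '\n'; // prints: gep0 gep1 vec.load store0
  return 0;
}

Building a fresh ordering keeps the sketch short; the actual pass instead iterates the block starting from I and calls removeFromParent()/insertBefore(I) on each collected instruction, which preserves the relative order in the same way.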