Index: lib/Transforms/Vectorize/LoadStoreVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -628,7 +628,7 @@ bool Vectorizer::vectorizeInstructions(ArrayRef Instrs) { DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size() << " instructions.\n"); - SmallSetVector Heads, Tails; + SmallVector Heads, Tails; int ConsecutiveChain[64]; // Do a quadratic search on all of the given stores and find all of the pairs @@ -647,8 +647,8 @@ continue; // Should not insert. } - Tails.insert(j); - Heads.insert(i); + Tails.push_back(j); + Heads.push_back(i); ConsecutiveChain[i] = j; } } @@ -660,21 +660,21 @@ for (int Head : Heads) { if (InstructionsProcessed.count(Instrs[Head])) continue; - bool longerChainExists = false; + bool LongerChainExists = false; for (unsigned TIt = 0; TIt < Tails.size(); TIt++) if (Head == Tails[TIt] && !InstructionsProcessed.count(Instrs[Heads[TIt]])) { - longerChainExists = true; + LongerChainExists = true; break; } - if (longerChainExists) + if (LongerChainExists) continue; // We found an instr that starts a chain. Now follow the chain and try to // vectorize it. SmallVector Operands; int I = Head; - while (I != -1 && (Tails.count(I) || Heads.count(I))) { + while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) { if (InstructionsProcessed.count(Instrs[I])) break; Index: test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll =================================================================== --- /dev/null +++ test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -0,0 +1,64 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -basicaa -load-store-vectorizer -S -o - %s | FileCheck %s + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + +; Checks that there is no crash when there are multiple tails +; for a the same head starting a chain. +@0 = internal addrspace(3) global [16384 x i32] undef + +; CHECK-LABEL: @no_crash( +; CHECK: store <2 x i32> zeroinitializer +; CHECK: store i32 0 +; CHECK: store i32 0 + +define void @no_crash(i32 %arg) { + %tmp2 = add i32 %arg, 14 + %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 + %tmp4 = add i32 %arg, 15 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp4 + + store i32 0, i32 addrspace(3)* %tmp3, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + store i32 0, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + +; Check adjiacent memory locations are properly matched and the +; longest chain vectorized + +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load <2 x i32> +; CHECK: load i32 +; CHECK: load i32 + +define void @interleave_get_longest(i32 %arg) { + %a1 = add i32 %arg, 1 + %a2 = add i32 %arg, 2 + %a3 = add i32 %arg, 3 + %a4 = add i32 %arg, 4 + %tmp1 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %arg + %tmp2 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a1 + %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a2 + %tmp4 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a3 + %tmp5 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %a4 + + %l1 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l2 = load i32, i32 addrspace(3)* %tmp1, align 4 + store i32 0, i32 addrspace(3)* %tmp2, align 4 + store i32 0, i32 addrspace(3)* %tmp1, align 4 + %l3 = load i32, i32 addrspace(3)* %tmp2, align 4 + %l4 = load i32, i32 addrspace(3)* %tmp3, align 4 + %l5 = load i32, i32 addrspace(3)* %tmp4, align 4 + %l6 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l7 = load i32, i32 addrspace(3)* %tmp5, align 4 + %l8 = load i32, i32 addrspace(3)* %tmp5, align 4 + + ret void +} + Index: test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll =================================================================== --- test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll +++ test/Transforms/LoadStoreVectorizer/X86/subchain-interleaved.ll @@ -85,3 +85,33 @@ ret void } +; FIXME: If the chain is too long and TLI says misaligned is not fast, +; then LSV fails to vectorize anything in that chain. +; To reproduce below, add a tmp5 (ptr+4) and load tmp5 into l6 and l7. + +; CHECK-LABEL: @interleave_get_longest +; CHECK: load <3 x i32> +; CHECK: load i32 +; CHECK: store <2 x i32> zeroinitializer +; CHECK: load i32 +; CHECK: load i32 +; CHECK: load i32 + +define void @interleave_get_longest(i32* noalias %ptr) { + %tmp1 = getelementptr i32, i32* %ptr, i64 0 + %tmp2 = getelementptr i32, i32* %ptr, i64 1 + %tmp3 = getelementptr i32, i32* %ptr, i64 2 + %tmp4 = getelementptr i32, i32* %ptr, i64 3 + + %l1 = load i32, i32* %tmp2, align 4 + %l2 = load i32, i32* %tmp1, align 4 + store i32 0, i32* %tmp2, align 4 + store i32 0, i32* %tmp1, align 4 + %l3 = load i32, i32* %tmp2, align 4 + %l4 = load i32, i32* %tmp3, align 4 + %l5 = load i32, i32* %tmp4, align 4 + %l6 = load i32, i32* %tmp4, align 4 + %l7 = load i32, i32* %tmp4, align 4 + + ret void +}