diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -693,7 +693,37 @@
     }
   });
+  // Check for atomic barriers, across which regular loads/stores cannot be
+  // freely moved.
+  // TODO: Certain load or store movement is still allowed for acquire or
+  // release orderings.
+  auto IsAtomicBarrier = [](Instruction *I) {
+    if (!I->isAtomic())
+      return false;
+    if (auto *LD = dyn_cast<LoadInst>(I))
+      return isStrongerThanMonotonic(LD->getOrdering());
+    if (auto *ST = dyn_cast<StoreInst>(I))
+      return isStrongerThanMonotonic(ST->getOrdering());
+    if (auto *RMW = dyn_cast<AtomicRMWInst>(I))
+      return isStrongerThanMonotonic(RMW->getOrdering());
+    if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(I))
+      return isStrongerThanMonotonic(CmpXchg->getMergedOrdering());
+    // Atomic instructions like fence have an ordering at least as strong as
+    // acquire. Treat them and other unknown atomic ops conservatively.
+    return true;
+  };
   for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
+    if (I.isAtomic()) {
+      if (IsAtomicBarrier(&I)) {
+        LLVM_DEBUG(dbgs() << "LSV: Found atomic barrier: " << I << '\n');
+        break;
+      }
+      // Regular loads/stores may be moved across such atomics, but an extra
+      // aliasing check is required.
+      MemoryInstrs.push_back(&I);
+      continue;
+    }
     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
       if (!is_contained(Chain, &I))
         MemoryInstrs.push_back(&I);
@@ -767,13 +797,24 @@
       if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
                         MemoryLocation::get(ChainInstr))) {
         LLVM_DEBUG({
+          auto getPointerOperand = [](const Instruction *I) -> const Value * {
+            if (auto *Load = dyn_cast<LoadInst>(I))
+              return Load->getPointerOperand();
+            if (auto *Store = dyn_cast<StoreInst>(I))
+              return Store->getPointerOperand();
+            if (auto *RMW = dyn_cast<AtomicRMWInst>(I))
+              return RMW->getPointerOperand();
+            if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(I))
+              return CmpXchg->getPointerOperand();
+            return nullptr;
+          };
           dbgs() << "LSV: Found alias:\n"
                     "  Aliasing instruction and pointer:\n"
                  << "  " << *MemInstr << '\n'
-                 << "  " << *getLoadStorePointerOperand(MemInstr) << '\n'
+                 << "  " << *getPointerOperand(MemInstr) << '\n'
                  << "  Aliased instruction and pointer:\n"
                  << "  " << *ChainInstr << '\n'
-                 << "  " << *getLoadStorePointerOperand(ChainInstr) << '\n';
+                 << "  " << *getPointerOperand(ChainInstr) << '\n';
         });
         // Save this aliasing memory instruction as a barrier, but allow other
         // instructions that precede the barrier to be vectorized with this one.
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/atomic.ll b/llvm/test/Transforms/LoadStoreVectorizer/atomic.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/atomic.ll
@@ -0,0 +1,237 @@
+; RUN: opt -S < %s -load-store-vectorizer | FileCheck %s
+; RUN: opt -S < %s -passes='function(load-store-vectorizer)' | FileCheck %s
+
+; Load/store vectorization is allowed across atomic ops whose ordering is not
+; stronger than monotonic.
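+;
+; Orderings stronger than monotonic (acquire, release, acq_rel, seq_cst) act
+; as barriers that end the candidate chain, while unordered and monotonic
+; atomics only require an extra aliasing check. The @t* tests below should
+; vectorize across monotonic atomics, the @f* tests should be blocked by
+; stronger orderings, and the @g* tests should be blocked because the atomics
+; alias the chain.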
+
+; CHECK-LABEL: t0
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+define void @t0(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  %a0 = load atomic i32, i32* %q0 monotonic, align 4
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  %a1 = load atomic i32, i32* %q1 monotonic, align 4
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: f0
+; CHECK-NOT: load <4 x float>
+; CHECK-NOT: store <4 x float>
+; CHECK: ret void
+define void @f0(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  %a0 = load atomic i32, i32* %q0 acquire, align 4
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  %a1 = load atomic i32, i32* %q1 acquire, align 4
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: t1
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+define void @t1(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  store atomic i32 0, i32* %q0 monotonic, align 4
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  store atomic i32 1, i32* %q1 monotonic, align 4
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: f1
+; CHECK-NOT: load <4 x float>
+; CHECK-NOT: store <4 x float>
+; CHECK: ret void
+define void @f1(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  store atomic i32 0, i32* %q0 release, align 4
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  store atomic i32 1, i32* %q1 release, align 4
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: t2
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+define void @t2(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  %a0 = atomicrmw add i32* %q0, i32 1 monotonic
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  %a1 = atomicrmw add i32* %q1, i32 2 monotonic
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: f2
+; CHECK-NOT: load <4 x float>
+; CHECK-NOT: store <4 x float>
+; CHECK: ret void
+define void @f2(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  %a0 = atomicrmw add i32* %q0, i32 1 acq_rel
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  %a1 = atomicrmw add i32* %q1, i32 2 acq_rel
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: g2
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: store <4 x i32>
+; CHECK: ret void
+define void @g2(i32* noalias %p) {
+  %p0 = getelementptr i32, i32* %p, i64 0
+  %p1 = getelementptr i32, i32* %p, i64 1
+  %p2 = getelementptr i32, i32* %p, i64 2
+  %p3 = getelementptr i32, i32* %p, i64 3
+  %l0 = load i32, i32* %p0, align 16
+  %l1 = load i32, i32* %p1
+  %l2 = load i32, i32* %p2
+  %a0 = atomicrmw add i32* %p3, i32 1 monotonic
+  %l3 = load i32, i32* %p3
+  store i32 %l0, i32* %p0, align 16
+  %a1 = atomicrmw add i32* %p0, i32 2 monotonic
+  store i32 %l1, i32* %p1
+  store i32 %l2, i32* %p2
+  store i32 %l3, i32* %p3
+  ret void
+}
+
+; CHECK-LABEL: t3
+; CHECK: load <4 x float>
+; CHECK: store <4 x float>
+define void @t3(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  %a0 = cmpxchg i32* %q0, i32 0, i32 1 monotonic monotonic
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  %a1 = cmpxchg i32* %q1, i32 2, i32 3 monotonic monotonic
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: f3
+; CHECK-NOT: load <4 x float>
+; CHECK-NOT: store <4 x float>
+; CHECK: ret void
+define void @f3(float* noalias %p, i32* noalias %q) {
+  %q0 = getelementptr i32, i32* %q, i64 0
+  %q1 = getelementptr i32, i32* %q, i64 1
+  %p0 = getelementptr float, float* %p, i64 0
+  %p1 = getelementptr float, float* %p, i64 1
+  %p2 = getelementptr float, float* %p, i64 2
+  %p3 = getelementptr float, float* %p, i64 3
+  %l0 = load float, float* %p0, align 16
+  %l1 = load float, float* %p1
+  %l2 = load float, float* %p2
+  %a0 = cmpxchg i32* %q0, i32 0, i32 1 acquire monotonic
+  %l3 = load float, float* %p3
+  store float %l0, float* %p0, align 16
+  %a1 = cmpxchg i32* %q1, i32 2, i32 3 acquire monotonic
+  store float %l1, float* %p1
+  store float %l2, float* %p2
+  store float %l3, float* %p3
+  ret void
+}
+
+; CHECK-LABEL: g3
+; CHECK-NOT: load <4 x i32>
+; CHECK-NOT: store <4 x i32>
+; CHECK: ret void
+define void @g3(i32* noalias %p) {
+  %p0 = getelementptr i32, i32* %p, i64 0
+  %p1 = getelementptr i32, i32* %p, i64 1
+  %p2 = getelementptr i32, i32* %p, i64 2
+  %p3 = getelementptr i32, i32* %p, i64 3
+  %l0 = load i32, i32* %p0, align 16
+  %l1 = load i32, i32* %p1
+  %l2 = load i32, i32* %p2
+  %a0 = cmpxchg i32* %p3, i32 0, i32 1 acquire monotonic
+  %l3 = load i32, i32* %p3
+  store i32 %l0, i32* %p0, align 16
+  %a1 = cmpxchg i32* %p0, i32 2, i32 3 acquire monotonic
+  store i32 %l1, i32* %p1
+  store i32 %l2, i32* %p2
+  store i32 %l3, i32* %p3
+  ret void
+}
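
Note for reviewers (not part of the patch): below is a minimal standalone sketch of the ordering classification the patch relies on. The main() driver is purely illustrative, but isStrongerThanMonotonic and toIRString are the real helpers from llvm/Support/AtomicOrdering.h. For atomic loads, stores, RMWs, and cmpxchgs, the orderings reported here as stronger than monotonic are exactly the ones IsAtomicBarrier treats as chain barriers.

#include "llvm/Support/AtomicOrdering.h"
#include <cstdio>

int main() {
  using llvm::AtomicOrdering;
  // Every ordering an atomic memory instruction can carry, weakest first.
  const AtomicOrdering Orders[] = {
      AtomicOrdering::Unordered,      AtomicOrdering::Monotonic,
      AtomicOrdering::Acquire,        AtomicOrdering::Release,
      AtomicOrdering::AcquireRelease, AtomicOrdering::SequentiallyConsistent};
  for (AtomicOrdering AO : Orders)
    // Expect barrier=0 for unordered/monotonic, barrier=1 for the rest.
    std::printf("%-24s barrier=%d\n", llvm::toIRString(AO),
                (int)llvm::isStrongerThanMonotonic(AO));
  return 0;
}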