diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1514,14 +1514,14 @@
   // Returns true if \p I is an instruction that will be predicated either
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I) {
+  bool isPredicatedInst(Instruction *I, ElementCount VF) {
     if (!blockNeedsPredication(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
     // instructions.
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
       return Legal->isMaskRequired(I);
-    return isScalarWithPredication(I);
+    return isScalarWithPredication(I, VF);
   }

   /// Returns true if \p I is a memory instruction with consecutive memory
@@ -6552,7 +6552,8 @@
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
+  assert(isPredicatedInst(I, ElementCount::getFixed(1)) &&
+         "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate);

@@ -6824,7 +6825,7 @@
   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
-  if (isPredicatedInst(I)) {
+  if (isPredicatedInst(I, ElementCount::getFixed(1))) {
     Cost /= getReciprocalPredBlockProb();

     // Add the cost of an i1 extract and a branch
@@ -8677,8 +8678,7 @@
       Range);

   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
-      Range);
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range);

   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
@@ -0,0 +1,52 @@
+; RUN: opt -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
+; RUN: opt -mattr=+sve -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test is defending against a bug that appeared when we have a target
+; configuration where masked loads/stores are legal -- e.g. AArch64 with SVE.
+; Predication would not be applied during interleaving, enabling the
+; possibility of superfluous loads/stores which could result in miscompiles.
+; This test checks that, when we disable vectorisation and force interleaving,
+; stores are predicated properly.
+;
+; This is _not_ an SVE-specific test. The same bug could manifest on any
+; architecture with masked loads/stores, but we use SVE for testing purposes
+; here.
+ +define void @foo(i32* %data1, i32* %data2) { +; CHECK-LABEL: @foo( +; CHECK: vector.body: +; CHECK: br i1 {{%.*}}, label %pred.store.if, label %pred.store.continue +; CHECK: pred.store.if: +; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}} +; CHECK-NEXT: br label %pred.store.continue +; CHECK: pred.store.continue: +; CHECK-NEXT: br i1 {{%.*}}, label %pred.store.if2, label %pred.store.continue3 +; CHECK: pred.store.if2: +; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}} +; CHECK-NEXT: br label %pred.store.continue3 +; CHECK: pred.store.continue3: + +entry: + br label %while.body + +while.body: + %i = phi i64 [ 1023, %entry ], [ %i.next, %if.end ] + %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %i + %ld = load i32, i32* %arrayidx, align 4 + %cmp = icmp sgt i32 %ld, %ld + br i1 %cmp, label %if.then, label %if.end + +if.then: + store i32 %ld, i32* %arrayidx, align 4 + br label %if.end + +if.end: + %i.next = add nsw i64 %i, -1 + %tobool.not = icmp eq i64 %i, 0 + br i1 %tobool.not, label %while.end, label %while.body + +while.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -65,9 +65,12 @@ ; sink-scalar-operands optimization for predicated instructions. ; ; SINK-GATHER: vector.body: -; SINK-GATHER: pred.udiv.if: +; SINK-GATHER: pred.load.if: ; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4 -; SINK-GATHER: %{{.*}} = udiv i32 %[[T0]], %{{.*}} +; SINK-GATHER: pred.load.continue: +; SINK-GATHER: %[[T1:.+]] = phi i32 [ poison, %vector.body ], [ %[[T0]], %pred.load.if ] +; SINK-GATHER: pred.udiv.if: +; SINK-GATHER: %{{.*}} = udiv i32 %[[T1]], %{{.*}} ; SINK-GATHER: pred.udiv.continue: define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) { entry: