diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1509,14 +1509,17 @@
   // Returns true if \p I is an instruction that will be predicated either
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
-  bool isPredicatedInst(Instruction *I) {
+  // Optional parameter \p VF is unused by this function, and passed directly to
+  // isScalarWithPredication in the fall-back case.
+  bool isPredicatedInst(Instruction *I,
+                        ElementCount VF = ElementCount::getFixed(1)) {
     if (!blockNeedsPredication(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
     // instructions.
     if (isa<LoadInst>(I) || isa<StoreInst>(I))
       return Legal->isMaskRequired(I);
-    return isScalarWithPredication(I);
+    return isScalarWithPredication(I, VF);
   }
 
   /// Returns true if \p I is a memory instruction with consecutive memory
@@ -8594,8 +8597,7 @@
       Range);
 
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
-      [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
-      Range);
+      [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); }, Range);
 
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -dce -instcombine \
+; RUN:   -debug-only=loop-vectorize \
+; RUN:   -S -o - 2>%t < %s | FileCheck %s
+; RUN: FileCheck --check-prefix=DBG %s < %t
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;
+; IR generated from (approximately):
+;
+; 1      void foo(int *restrict data1, int *restrict data2)
+; 2      {
+; 3        int counter = 1024;
+; 4        while (counter--)
+; 5          if (data1[counter] > data2[counter])
+; 6            data1[counter] = data2[counter];
+; 7      }
+;
+
+define void @foo(i32* %data1, i32* %data2) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 1023, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DATA1:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DATA2:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 [[TMP1]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret void
+;
+; DBG: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %indvars.iv
+; DBG-NEXT: LV: Scalarizing: %0 = load i32, i32* %arrayidx, align 4
+; DBG-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, i32* %data2, i64 %indvars.iv
+; DBG-NEXT: LV: Scalarizing: %1 = load i32, i32* %arrayidx2, align 4
+; DBG-NEXT: LV: Scalarizing: %cmp = icmp sgt i32 %0, %1
+; DBG-NEXT: LV: Scalarizing and predicating: store i32 %1, i32* %arrayidx, align 4
+; DBG-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %indvars.iv
+; DBG-NEXT: LV: Scalarizing: %arrayidx2 = getelementptr inbounds i32, i32* %data2, i64 %indvars.iv
+entry:
+  br label %while.body
+
+while.body:
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %if.end ]
+  %arrayidx = getelementptr inbounds i32, i32* %data1, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %data2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %cmp = icmp sgt i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  store i32 %1, i32* %arrayidx, align 4
+  br label %if.end
+
+if.end:
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool.not = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -65,9 +65,11 @@
 ; sink-scalar-operands optimization for predicated instructions.
 ;
 ; SINK-GATHER: vector.body:
+; SINK-GATHER: pred.load.if:
+; SINK-GATHER:   %{{.*}} = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER: pred.load.continue:
 ; SINK-GATHER: pred.udiv.if:
-; SINK-GATHER:   %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
-; SINK-GATHER:   %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER:   %{{.*}} = udiv i32 %{{.*}}, %{{.*}}
 ; SINK-GATHER: pred.udiv.continue:
 define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
 entry: