PowerPC TTI: charge an extra mask/compare cost for i1 vector element extracts.

Summary of the hunks below:
  * PPCTargetTransformInfo.cpp: adds the hidden boolean option
    "ppc-vec-mask-cost" (initial value true). For integer vectors with a
    known element index, when the option is set and the scalar element is
    1 bit wide, MaskCost is 1 (otherwise 0). MaskCost is added to the
    generic vector-extract cost on the hasP9Altivec() path and to the
    non-insert cost on the hasDirectMove() path. The INSERT_VECTOR_ELT
    cost on the hasDirectMove() path stays at 3.
  * reduce-and.ll / reduce-or.ll: expected costs are updated for the
    v1i1 (3 to 4), v32i1 (97 to 129), v64i1 (193 to 257) and
    v128i1 (386 to 514) reductions.
  * predcost.ll (new): runs loop-vectorize with -mcpu=pwr8 and the option
    enabled, and checks that no "extractelement <16 x i1>" is emitted.

NOTE(review): indentation and intra-line spacing of this patch were
reconstructed from a whitespace-collapsed copy; if a context line fails to
match, apply with whitespace tolerance (e.g. git apply --ignore-whitespace).

diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -27,6 +27,9 @@
 
 #define DEBUG_TYPE "ppctti"
 
+static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
+cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);
+
 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
 
@@ -700,6 +703,9 @@
     return Cost;
 
   } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
+    unsigned EltSize = Val->getScalarSizeInBits();
+    // Computing on 1 bit values requires extra mask or compare operations.
+    unsigned MaskCost = VecMaskCost && EltSize == 1 ? 1 : 0;
     if (ST->hasP9Altivec()) {
       if (ISD == ISD::INSERT_VECTOR_ELT)
         // A move-to VSR and a permute/insert. Assume vector operation cost
@@ -721,12 +727,15 @@
       // We need a vector extract (or mfvsrld). Assume vector operation cost.
       // The cost of the load constant for a vector extract is disregarded
       // (invariant, easily schedulable).
-      return CostFactor;
+      return CostFactor + MaskCost;
 
-    } else if (ST->hasDirectMove())
+    } else if (ST->hasDirectMove()) {
       // Assume permute has standard cost.
       // Assume move-to/move-from VSR have 2x standard cost.
-      return 3;
+      if (ISD == ISD::INSERT_VECTOR_ELT)
+        return 3;
+      return 3 + MaskCost;
+    }
   }
 
   // Estimated cost of a load-hit-store delay. This was obtained
diff --git a/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll b/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll
--- a/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll
@@ -3,14 +3,14 @@
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 386 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 257 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 514 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef)
diff --git a/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll b/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll
--- a/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll
@@ -3,14 +3,14 @@
 
 define i32 @reduce_i1(i32 %arg) {
 ; CHECK-LABEL: 'reduce_i1'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 386 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 257 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 514 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/predcost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/predcost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/predcost.ll
@@ -0,0 +1,29 @@
+; RUN: opt -ppc-vec-mask-cost=true -aa-pipeline=basic-aa -mcpu=pwr8 -S -passes=loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-Fn32-i64:64-n32:64-S128-v256:256:256-v512:512:512"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define dso_local void @_tc(ptr nocapture noundef %aaa, i64 noundef %bbb) local_unnamed_addr {
+; CHECK-NOT: extractelement <16 x i1>
+entry:
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  ret void
+
+for.body:                                         ; preds = %for.inc, %entry
+  %i.08 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %aaa, i64 %i.08
+  %0 = load i8, ptr %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i8 32, ptr %arrayidx, align 1
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %for.body
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %bbb
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}