Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -129,6 +129,19 @@
                                   MaybeAlign Alignment, unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr);
+  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+                                         const Value *Ptr, bool VariableMask,
+                                         Align Alignment,
+                                         TTI::TargetCostKind CostKind,
+                                         const Instruction *I = nullptr);
+  bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
+  bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+    return forceScalarizeMaskedGather(VTy, Alignment);
+  }
+  bool isLegalMaskedGather(Type *DataType, Align Alignment);
+  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
+    return isLegalMaskedGather(DataType, Alignment);
+  }
   InstructionCost getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -24,6 +24,13 @@
 using namespace llvm;
 
 #define DEBUG_TYPE "ppctti"
+
+static cl::opt<bool> EnableLegalMaskedGather("ppc-enable-gather", cl::Hidden,
+                                             cl::init(false),
+                                             cl::desc("Enable masked gather"));
+static cl::opt<unsigned>
+    ForceGatherCost("force-gather-cost", cl::Hidden, cl::init(0),
+                    cl::desc("For debugging, sets user specified cost"));
 
 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
@@ -1238,6 +1245,57 @@
   return Cost;
 }
 
+/// Report whether masked gathers have to be scalarized.
+///
+/// Mirrors the -ppc-enable-gather option: gathers that are reported as legal
+/// are still lowered by scalarizing them.
+bool PPCTTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+  return EnableLegalMaskedGather;
+}
+
+/// Report whether a masked gather of \p DataTy at \p Alignment is legal.
+///
+/// Requires -ppc-enable-gather. Elements must be 8, 16, 32 or 64 bits wide,
+/// and 16/32/64-bit elements must be aligned to at least their own size.
+bool PPCTTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
+  if (!EnableLegalMaskedGather)
+    return false;
+  unsigned EltWidth = DataTy->getScalarSizeInBits();
+  return ((EltWidth == 64 && Alignment >= 8) ||
+          (EltWidth == 32 && Alignment >= 4) ||
+          (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
+}
+
+/// Calculate the cost of a Gather/Scatter operation.
+///
+/// Defers to the base implementation unless -ppc-enable-gather is set. If
+/// -force-gather-cost was given on the command line, that value is returned
+/// as is; otherwise the cost is the cost of scalarizing the access.
+InstructionCost PPCTTIImpl::getGatherScatterOpCost(
+    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+    Align Alignment, TTI::TargetCostKind CostKind,
+    const Instruction *I) {
+  if (!EnableLegalMaskedGather)
+    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+                                         Alignment, CostKind, I);
+
+  if (ForceGatherCost.getNumOccurrences() > 0)
+    return InstructionCost(ForceGatherCost);
+
+  assert(DataTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
+  auto *VTy = cast<FixedVectorType>(DataTy);
+
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
+
+  // We are currently using forceScalarizeMaskedGather to lower the
+  // gather/scatter instructions. So the cost will be the cost of scalarizing.
+  unsigned NumElems = VTy->getNumElements();
+  InstructionCost Cost =
+      NumElems * LT.first + getScalarizationOverhead(VTy, true, false);
+
+  return Cost;
+}
+
 InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Index: llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-gather.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-gather.ll
@@ -0,0 +1,51 @@
+; RUN: opt -S -loop-vectorize -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-enable-gather=true -force-gather-cost=1 -mcpu=pwr9 < %s | FileCheck %s
+
+; CHECK-LABEL: @vmul
+; CHECK: vector.body:
+; CHECK: call <2 x double> @llvm.masked.gather
+define dso_local noundef double @vmul(i32* noundef readonly %rowstart, i32* noundef readnone %rowend, double* nocapture noundef readonly %luval, double* nocapture noundef readonly %dst) {
+entry:
+  %cmp.not8 = icmp eq i32* %rowstart, %rowend
+  br i1 %cmp.not8, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %add.lcssa = phi double [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret double %res.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %col.011 = phi i32* [ %incdec.ptr, %for.body ], [ %rowstart, %for.body.preheader ]
+  %res.010 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+  %luval.addr.09 = phi double* [ %incdec.ptr1, %for.body ], [ %luval, %for.body.preheader ]
+  %0 = load double, double* %luval.addr.09, align 8, !tbaa !3
+  %1 = load i32, i32* %col.011, align 4, !tbaa !7
+  %idxprom = zext i32 %1 to i64
+  %arrayidx = getelementptr inbounds double, double* %dst, i64 %idxprom
+  %2 = load double, double* %arrayidx, align 8, !tbaa !3
+  %mul = fmul fast double %2, %0
+  %add = fadd fast double %mul, %res.010
+  %incdec.ptr = getelementptr inbounds i32, i32* %col.011, i64 1
+  %incdec.ptr1 = getelementptr inbounds double, double* %luval.addr.09, i64 1
+  %cmp.not = icmp eq i32* %incdec.ptr, %rowend
+  br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{!"XL C/C++ for Linux on Power, (IBM Internal Development Branch), clang version 14.0.0"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"double", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-scatter.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-masked-scatter.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -loop-vectorize -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-enable-gather=true -force-gather-cost=1 -mcpu=pwr9 < %s | FileCheck %s
+
+; CHECK-LABEL: @test
+; CHECK: vector.body:
+; CHECK: call void @llvm.masked.scatter
+define dso_local void @test(float* noalias nocapture noundef readonly %in, float* noalias nocapture noundef writeonly %out, i32* noalias nocapture noundef readonly %index, i32 noundef signext %SIZE) {
+entry:
+  %cmp9 = icmp sgt i32 %SIZE, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %SIZE to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %in, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !tbaa !3
+  %add = fadd fast float %0, 5.000000e-01
+  %arrayidx2 = getelementptr inbounds i32, i32* %index, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4, !tbaa !7
+  %idxprom3 = sext i32 %1 to i64
+  %arrayidx4 = getelementptr inbounds float, float* %out, i64 %idxprom3
+  store float %add, float* %arrayidx4, align 4, !tbaa !3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{!"XL C/C++ for Linux on Power, (IBM Internal Development Branch), clang version 14.0.0"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"float", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
Index: llvm/test/Transforms/SLPVectorizer/PowerPC/slp-masked-gather.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SLPVectorizer/PowerPC/slp-masked-gather.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer -ppc-enable-gather=true -force-gather-cost=1 < %s | FileCheck %s
+
+; CHECK-LABEL: @test
+; CHECK: call <4 x i32> @llvm.masked.gather
+define dso_local void @test(i32** noalias nocapture noundef readonly %a, i32* noalias nocapture noundef writeonly %b) {
+entry:
+  %arrayidx = getelementptr inbounds i32*, i32** %a, i64 5
+  %0 = load i32*, i32** %arrayidx, align 8, !tbaa !3
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 18
+  %1 = load i32, i32* %arrayidx1, align 4, !tbaa !7
+  %sub = add nsw i32 %1, -10
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 1
+  store i32 %sub, i32* %arrayidx2, align 4, !tbaa !7
+  %arrayidx4 = getelementptr inbounds i32, i32* %0, i64 27
+  %2 = load i32, i32* %arrayidx4, align 4, !tbaa !7
+  %sub5 = add nsw i32 %2, -10
+  %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 2
+  store i32 %sub5, i32* %arrayidx6, align 4, !tbaa !7
+  %arrayidx8 = getelementptr inbounds i32, i32* %0, i64 36
+  %3 = load i32, i32* %arrayidx8, align 4, !tbaa !7
+  %sub9 = add nsw i32 %3, -10
+  %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 3
+  store i32 %sub9, i32* %arrayidx10, align 4, !tbaa !7
+  %arrayidx12 = getelementptr inbounds i32, i32* %0, i64 45
+  %4 = load i32, i32* %arrayidx12, align 4, !tbaa !7
+  %sub13 = add nsw i32 %4, -10
+  %arrayidx14 = getelementptr inbounds i32, i32* %b, i64 4
+  store i32 %sub13, i32* %arrayidx14, align 4, !tbaa !7
+  %arrayidx16 = getelementptr inbounds i32, i32* %0, i64 54
+  %5 = load i32, i32* %arrayidx16, align 4, !tbaa !7
+  %sub17 = add nsw i32 %5, -10
+  %arrayidx18 = getelementptr inbounds i32, i32* %b, i64 5
+  store i32 %sub17, i32* %arrayidx18, align 4, !tbaa !7
+  %arrayidx20 = getelementptr inbounds i32, i32* %0, i64 63
+  %6 = load i32, i32* %arrayidx20, align 4, !tbaa !7
+  %sub21 = add nsw i32 %6, -10
+  %arrayidx22 = getelementptr inbounds i32, i32* %b, i64 6
+  store i32 %sub21, i32* %arrayidx22, align 4, !tbaa !7
+  %arrayidx24 = getelementptr inbounds i32, i32* %0, i64 72
+  %7 = load i32, i32* %arrayidx24, align 4, !tbaa !7
+  %sub25 = add nsw i32 %7, -10
+  %arrayidx26 = getelementptr inbounds i32, i32* %b, i64 7
+  store i32 %sub25, i32* %arrayidx26, align 4, !tbaa !7
+  %arrayidx28 = getelementptr inbounds i32, i32* %0, i64 81
+  %8 = load i32, i32* %arrayidx28, align 4, !tbaa !7
+  %sub29 = add nsw i32 %8, -10
+  %arrayidx30 = getelementptr inbounds i32, i32* %b, i64 8
+  store i32 %sub29, i32* %arrayidx30, align 4, !tbaa !7
+  ret void
+}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"uwtable", i32 1}
+!2 = !{!"XL C/C++ for Linux on Power, (IBM Internal Development Branch), clang version 14.0.0"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"any pointer", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}