diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1279,6 +1279,8 @@ SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -906,6 +906,14 @@ else setOperationAction(ISD::MUL, MVT::v4i32, Custom); + if (Subtarget.isISA3_0()) { + setOperationAction(ISD::VECREDUCE_ADD, MVT::v16i8, Legal); + // This has to be custom because we want to lower it before type + // legalization gets to the type and expands everything out.
+ setOperationAction(ISD::VECREDUCE_ADD, MVT::v16i16, Custom); + setOperationAction(ISD::VECREDUCE_ADD, MVT::v16i32, Custom); + } + if (Subtarget.isISA3_1()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MULHS, MVT::v2i64, Legal); @@ -10729,6 +10737,61 @@ } } +SDValue PPCTargetLowering::LowerVECREDUCE_ADD(SDValue Op, + SelectionDAG &DAG) const { + SDValue VecInput = Op.getOperand(0); + MVT InputType = VecInput.getSimpleValueType(); + SDLoc dl(Op); + + // A vector with all zeros for the form: {0, 0, 0, 0} + SDValue ZeroVector = DAG.getTargetConstant(0, dl, MVT::v4i32); + // A vector where all bytes are one of the form: + // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + SDNode *AllOnes = DAG.getMachineNode(PPC::XXSPLTIB, dl, MVT::v16i8, + DAG.getTargetConstant(1, dl, MVT::i32)); + + switch (InputType.SimpleTy) { + default: + return SDValue(); + case MVT::v16i32: + case MVT::v16i16: { + if (VecInput.getOpcode() != ISD::SIGN_EXTEND && + VecInput.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + + // Check that we are extending from v16i8. + if (VecInput.getOperand(0).getSimpleValueType() != MVT::v16i8) + return SDValue(); + + // Sum Bytes and place the result in each word. + // {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p} turns into + // {a+b+c+d, e+f+g+h, i+j+k+l, m+n+o+p} + SDNode *SumBytes = DAG.getMachineNode(PPC::VMSUMUBM, dl, MVT::v4i32, + VecInput.getOperand(0), + SDValue(AllOnes, 0), ZeroVector); + // Sum words and place result in single word.
+ // {a, b, c, d} turns into {0, 0, 0, a+b+c+d} + SDNode *SumAll = DAG.getMachineNode(PPC::VSUMSWS, dl, MVT::v2i64, + SDValue(SumBytes, 0), ZeroVector); + + if (InputType.SimpleTy == MVT::v16i32) { + SDValue Bitcast = + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, SDValue(SumAll, 0)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Bitcast, + DAG.getTargetConstant(3, dl, MVT::i32)); + } else { + SDValue Bitcast = + DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, SDValue(SumAll, 0)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Bitcast, + DAG.getTargetConstant(7, dl, MVT::i32)); + } + break; + } + } + + return SDValue(); +} + SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -11113,6 +11176,8 @@ return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG); + case ISD::VECREDUCE_ADD: + return LowerVECREDUCE_ADD(Op, DAG); } } diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -4126,6 +4126,17 @@ (v8i16 (VSPLTHs 3, (LXSIHZX ForceXForm:$A)))>; def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)), (v16i8 (VSPLTBs 7, (LXSIBZX ForceXForm:$A)))>; + +// Vector Sum Reductions +def : Pat<(i64 (anyext (i32 (vecreduce_add v16i8:$vA)))), + (i64 (MFVSRLD (VSUMSWS + (VMSUMUBM v16i8:$vA, (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 1), VSRC)), (v4i32(V_SET0))), + (v4i32(V_SET0)))))>; +def : Pat<(i32 (vecreduce_add v16i8:$vA)), + (i32 (EXTRACT_SUBREG (MFVSRLD (VSUMSWS + (VMSUMUBM v16i8:$vA, (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 1), VSRC)), (v4i32(V_SET0))), + (v4i32(V_SET0)))), sub_32))>; + } // HasVSX, HasP9Vector // Any Power9 VSX subtarget with equivalent length but better Power10 VSX diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ 
b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -63,6 +63,7 @@ TTI::TargetCostKind CostKind); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + bool shouldExpandReduction(const IntrinsicInst *II) const; bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -64,6 +64,43 @@ return TTI::PSK_Software; } +bool PPCTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { + const unsigned Directive = ST->getCPUDirective(); + + // P9 and up only. + if (Directive != PPC::DIR_PWR9 && Directive != PPC::DIR_PWR10 && + Directive != PPC::DIR_PWR_FUTURE) + return true; + + // Check if we are doing a vector reduce add. + if (II->getIntrinsicID() != Intrinsic::vector_reduce_add) + return true; + + Value *Operand = II->getArgOperand(0); + Type *OpType = Operand->getType(); + Type *V16x8Ty = + FixedVectorType::get(Type::getInt8Ty(Operand->getContext()), 16); + Type *V16x16Ty = + FixedVectorType::get(Type::getInt16Ty(Operand->getContext()), 16); + Type *V16x32Ty = + FixedVectorType::get(Type::getInt32Ty(Operand->getContext()), 16); + + Instruction *OperandInstr = dyn_cast<Instruction>(Operand); + if (!OperandInstr) + return true; + + // Only expand types that are fed by zero extends or sign extends. + unsigned OperandOpcode = OperandInstr->getOpcode(); + if (OperandOpcode != Instruction::SExt && OperandOpcode != Instruction::ZExt) + return true; + + if (OpType == V16x8Ty || OpType == V16x16Ty || OpType == V16x32Ty) + return false; + + // By default we want to expand everything we cannot cover.
+ return true; +} + Optional PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll --- a/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-add.ll @@ -549,84 +549,44 @@ define dso_local signext i16 @v16i8tov16i16_sign(<16 x i8> %a) local_unnamed_addr #0 { ; PWR9LE-LABEL: v16i8tov16i16_sign: ; PWR9LE: # %bb.0: # %entry -; PWR9LE-NEXT: vmrghb v3, v2, v2 -; PWR9LE-NEXT: vspltish v4, 8 -; PWR9LE-NEXT: li r3, 0 -; PWR9LE-NEXT: vmrglb v2, v2, v2 -; PWR9LE-NEXT: vslh v3, v3, v4 -; PWR9LE-NEXT: vslh v2, v2, v4 -; PWR9LE-NEXT: vsrah v3, v3, v4 -; PWR9LE-NEXT: vsrah v2, v2, v4 -; PWR9LE-NEXT: vadduhm v2, v2, v3 -; PWR9LE-NEXT: xxswapd v3, v2 -; PWR9LE-NEXT: vadduhm v2, v2, v3 -; PWR9LE-NEXT: xxspltw v3, v2, 2 -; PWR9LE-NEXT: vadduhm v2, v2, v3 -; PWR9LE-NEXT: vsplth v3, v2, 6 -; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: xxspltib v4, 1 +; PWR9LE-NEXT: xxlxor v3, v3, v3 +; PWR9LE-NEXT: li r3, 14 +; PWR9LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9LE-NEXT: vsumsws v2, v2, v3 ; PWR9LE-NEXT: vextuhrx r3, r3, v2 ; PWR9LE-NEXT: extsh r3, r3 ; PWR9LE-NEXT: blr ; ; PWR9BE-LABEL: v16i8tov16i16_sign: ; PWR9BE: # %bb.0: # %entry -; PWR9BE-NEXT: vmrglb v3, v2, v2 -; PWR9BE-NEXT: vspltish v4, 8 -; PWR9BE-NEXT: li r3, 0 -; PWR9BE-NEXT: vmrghb v2, v2, v2 -; PWR9BE-NEXT: vslh v3, v3, v4 -; PWR9BE-NEXT: vslh v2, v2, v4 -; PWR9BE-NEXT: vsrah v3, v3, v4 -; PWR9BE-NEXT: vsrah v2, v2, v4 -; PWR9BE-NEXT: vadduhm v2, v2, v3 -; PWR9BE-NEXT: xxswapd v3, v2 -; PWR9BE-NEXT: vadduhm v2, v2, v3 -; PWR9BE-NEXT: xxspltw v3, v2, 1 -; PWR9BE-NEXT: vadduhm v2, v2, v3 -; PWR9BE-NEXT: vsplth v3, v2, 1 -; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: xxspltib v4, 1 +; PWR9BE-NEXT: xxlxor v3, v3, v3 +; PWR9BE-NEXT: li r3, 14 +; PWR9BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9BE-NEXT: vsumsws v2, v2, v3 
; PWR9BE-NEXT: vextuhlx r3, r3, v2 ; PWR9BE-NEXT: extsh r3, r3 ; PWR9BE-NEXT: blr ; ; PWR10LE-LABEL: v16i8tov16i16_sign: ; PWR10LE: # %bb.0: # %entry -; PWR10LE-NEXT: vmrghb v3, v2, v2 -; PWR10LE-NEXT: xxspltiw v4, 524296 -; PWR10LE-NEXT: vmrglb v2, v2, v2 -; PWR10LE-NEXT: li r3, 0 -; PWR10LE-NEXT: vslh v3, v3, v4 -; PWR10LE-NEXT: vslh v2, v2, v4 -; PWR10LE-NEXT: vsrah v3, v3, v4 -; PWR10LE-NEXT: vsrah v2, v2, v4 -; PWR10LE-NEXT: vadduhm v2, v2, v3 -; PWR10LE-NEXT: xxswapd v3, v2 -; PWR10LE-NEXT: vadduhm v2, v2, v3 -; PWR10LE-NEXT: xxspltw v3, v2, 2 -; PWR10LE-NEXT: vadduhm v2, v2, v3 -; PWR10LE-NEXT: vsplth v3, v2, 6 -; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: xxspltib v4, 1 +; PWR10LE-NEXT: xxlxor v3, v3, v3 +; PWR10LE-NEXT: li r3, 14 +; PWR10LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10LE-NEXT: vsumsws v2, v2, v3 ; PWR10LE-NEXT: vextuhrx r3, r3, v2 ; PWR10LE-NEXT: extsh r3, r3 ; PWR10LE-NEXT: blr ; ; PWR10BE-LABEL: v16i8tov16i16_sign: ; PWR10BE: # %bb.0: # %entry -; PWR10BE-NEXT: vmrglb v3, v2, v2 -; PWR10BE-NEXT: xxspltiw v4, 524296 -; PWR10BE-NEXT: vmrghb v2, v2, v2 -; PWR10BE-NEXT: li r3, 0 -; PWR10BE-NEXT: vslh v3, v3, v4 -; PWR10BE-NEXT: vslh v2, v2, v4 -; PWR10BE-NEXT: vsrah v3, v3, v4 -; PWR10BE-NEXT: vsrah v2, v2, v4 -; PWR10BE-NEXT: vadduhm v2, v2, v3 -; PWR10BE-NEXT: xxswapd v3, v2 -; PWR10BE-NEXT: vadduhm v2, v2, v3 -; PWR10BE-NEXT: xxspltw v3, v2, 1 -; PWR10BE-NEXT: vadduhm v2, v2, v3 -; PWR10BE-NEXT: vsplth v3, v2, 1 -; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: xxspltib v4, 1 +; PWR10BE-NEXT: xxlxor v3, v3, v3 +; PWR10BE-NEXT: li r3, 14 +; PWR10BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10BE-NEXT: vsumsws v2, v2, v3 ; PWR10BE-NEXT: vextuhlx r3, r3, v2 ; PWR10BE-NEXT: extsh r3, r3 ; PWR10BE-NEXT: blr @@ -639,68 +599,44 @@ define dso_local zeroext i16 @v16i8tov16i16_zero(<16 x i8> %a) local_unnamed_addr #0 { ; PWR9LE-LABEL: v16i8tov16i16_zero: ; PWR9LE: # %bb.0: # %entry +; PWR9LE-NEXT: xxspltib v4, 1 ; PWR9LE-NEXT: xxlxor v3, v3, v3 -; 
PWR9LE-NEXT: li r3, 0 -; PWR9LE-NEXT: vmrghb v4, v3, v2 -; PWR9LE-NEXT: vmrglb v2, v3, v2 -; PWR9LE-NEXT: vadduhm v2, v2, v4 -; PWR9LE-NEXT: xxswapd v3, v2 -; PWR9LE-NEXT: vadduhm v2, v2, v3 -; PWR9LE-NEXT: xxspltw v3, v2, 2 -; PWR9LE-NEXT: vadduhm v2, v2, v3 -; PWR9LE-NEXT: vsplth v3, v2, 6 -; PWR9LE-NEXT: vadduhm v2, v2, v3 +; PWR9LE-NEXT: li r3, 14 +; PWR9LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9LE-NEXT: vsumsws v2, v2, v3 ; PWR9LE-NEXT: vextuhrx r3, r3, v2 ; PWR9LE-NEXT: clrldi r3, r3, 48 ; PWR9LE-NEXT: blr ; ; PWR9BE-LABEL: v16i8tov16i16_zero: ; PWR9BE: # %bb.0: # %entry +; PWR9BE-NEXT: xxspltib v4, 1 ; PWR9BE-NEXT: xxlxor v3, v3, v3 -; PWR9BE-NEXT: li r3, 0 -; PWR9BE-NEXT: vmrglb v4, v3, v2 -; PWR9BE-NEXT: vmrghb v2, v3, v2 -; PWR9BE-NEXT: vadduhm v2, v2, v4 -; PWR9BE-NEXT: xxswapd v3, v2 -; PWR9BE-NEXT: vadduhm v2, v2, v3 -; PWR9BE-NEXT: xxspltw v3, v2, 1 -; PWR9BE-NEXT: vadduhm v2, v2, v3 -; PWR9BE-NEXT: vsplth v3, v2, 1 -; PWR9BE-NEXT: vadduhm v2, v2, v3 +; PWR9BE-NEXT: li r3, 14 +; PWR9BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9BE-NEXT: vsumsws v2, v2, v3 ; PWR9BE-NEXT: vextuhlx r3, r3, v2 ; PWR9BE-NEXT: clrldi r3, r3, 48 ; PWR9BE-NEXT: blr ; ; PWR10LE-LABEL: v16i8tov16i16_zero: ; PWR10LE: # %bb.0: # %entry +; PWR10LE-NEXT: xxspltib v4, 1 ; PWR10LE-NEXT: xxlxor v3, v3, v3 -; PWR10LE-NEXT: li r3, 0 -; PWR10LE-NEXT: vmrghb v4, v3, v2 -; PWR10LE-NEXT: vmrglb v2, v3, v2 -; PWR10LE-NEXT: vadduhm v2, v2, v4 -; PWR10LE-NEXT: xxswapd v3, v2 -; PWR10LE-NEXT: vadduhm v2, v2, v3 -; PWR10LE-NEXT: xxspltw v3, v2, 2 -; PWR10LE-NEXT: vadduhm v2, v2, v3 -; PWR10LE-NEXT: vsplth v3, v2, 6 -; PWR10LE-NEXT: vadduhm v2, v2, v3 +; PWR10LE-NEXT: li r3, 14 +; PWR10LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10LE-NEXT: vsumsws v2, v2, v3 ; PWR10LE-NEXT: vextuhrx r3, r3, v2 ; PWR10LE-NEXT: clrldi r3, r3, 48 ; PWR10LE-NEXT: blr ; ; PWR10BE-LABEL: v16i8tov16i16_zero: ; PWR10BE: # %bb.0: # %entry +; PWR10BE-NEXT: xxspltib v4, 1 ; PWR10BE-NEXT: xxlxor v3, v3, v3 -; PWR10BE-NEXT: li r3, 0 -; 
PWR10BE-NEXT: vmrglb v4, v3, v2 -; PWR10BE-NEXT: vmrghb v2, v3, v2 -; PWR10BE-NEXT: vadduhm v2, v2, v4 -; PWR10BE-NEXT: xxswapd v3, v2 -; PWR10BE-NEXT: vadduhm v2, v2, v3 -; PWR10BE-NEXT: xxspltw v3, v2, 1 -; PWR10BE-NEXT: vadduhm v2, v2, v3 -; PWR10BE-NEXT: vsplth v3, v2, 1 -; PWR10BE-NEXT: vadduhm v2, v2, v3 +; PWR10BE-NEXT: li r3, 14 +; PWR10BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10BE-NEXT: vsumsws v2, v2, v3 ; PWR10BE-NEXT: vextuhlx r3, r3, v2 ; PWR10BE-NEXT: clrldi r3, r3, 48 ; PWR10BE-NEXT: blr @@ -982,128 +918,44 @@ define dso_local signext i32 @v16i8tov16i32_sign(<16 x i8> %a) local_unnamed_addr #0 { ; PWR9LE-LABEL: v16i8tov16i32_sign: ; PWR9LE: # %bb.0: # %entry -; PWR9LE-NEXT: addis r3, r2, .LCPI17_0@toc@ha -; PWR9LE-NEXT: addi r3, r3, .LCPI17_0@toc@l -; PWR9LE-NEXT: lxv v3, 0(r3) -; PWR9LE-NEXT: addis r3, r2, .LCPI17_1@toc@ha -; PWR9LE-NEXT: addi r3, r3, .LCPI17_1@toc@l -; PWR9LE-NEXT: lxv v4, 0(r3) -; PWR9LE-NEXT: addis r3, r2, .LCPI17_2@toc@ha -; PWR9LE-NEXT: vperm v3, v2, v2, v3 -; PWR9LE-NEXT: addi r3, r3, .LCPI17_2@toc@l -; PWR9LE-NEXT: lxv v5, 0(r3) -; PWR9LE-NEXT: addis r3, r2, .LCPI17_3@toc@ha -; PWR9LE-NEXT: vextsb2w v3, v3 -; PWR9LE-NEXT: vperm v4, v2, v2, v4 -; PWR9LE-NEXT: addi r3, r3, .LCPI17_3@toc@l -; PWR9LE-NEXT: lxv v0, 0(r3) -; PWR9LE-NEXT: vextsb2w v4, v4 -; PWR9LE-NEXT: li r3, 0 -; PWR9LE-NEXT: vperm v5, v2, v2, v5 -; PWR9LE-NEXT: vadduwm v3, v4, v3 -; PWR9LE-NEXT: vextsb2w v5, v5 -; PWR9LE-NEXT: vperm v2, v2, v2, v0 -; PWR9LE-NEXT: vextsb2w v2, v2 -; PWR9LE-NEXT: vadduwm v2, v2, v5 -; PWR9LE-NEXT: vadduwm v2, v3, v2 -; PWR9LE-NEXT: xxswapd v3, v2 -; PWR9LE-NEXT: vadduwm v2, v2, v3 -; PWR9LE-NEXT: xxspltw v3, v2, 2 -; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltib v4, 1 +; PWR9LE-NEXT: xxlxor v3, v3, v3 +; PWR9LE-NEXT: li r3, 12 +; PWR9LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9LE-NEXT: vsumsws v2, v2, v3 ; PWR9LE-NEXT: vextuwrx r3, r3, v2 ; PWR9LE-NEXT: extsw r3, r3 ; PWR9LE-NEXT: blr ; ; PWR9BE-LABEL: v16i8tov16i32_sign: ; 
PWR9BE: # %bb.0: # %entry -; PWR9BE-NEXT: addis r3, r2, .LCPI17_0@toc@ha -; PWR9BE-NEXT: addi r3, r3, .LCPI17_0@toc@l -; PWR9BE-NEXT: lxv v3, 0(r3) -; PWR9BE-NEXT: addis r3, r2, .LCPI17_1@toc@ha -; PWR9BE-NEXT: addi r3, r3, .LCPI17_1@toc@l -; PWR9BE-NEXT: lxv v4, 0(r3) -; PWR9BE-NEXT: addis r3, r2, .LCPI17_2@toc@ha -; PWR9BE-NEXT: vperm v3, v2, v2, v3 -; PWR9BE-NEXT: addi r3, r3, .LCPI17_2@toc@l -; PWR9BE-NEXT: lxv v5, 0(r3) -; PWR9BE-NEXT: addis r3, r2, .LCPI17_3@toc@ha -; PWR9BE-NEXT: vextsb2w v3, v3 -; PWR9BE-NEXT: vperm v4, v2, v2, v4 -; PWR9BE-NEXT: addi r3, r3, .LCPI17_3@toc@l -; PWR9BE-NEXT: lxv v0, 0(r3) -; PWR9BE-NEXT: vextsb2w v4, v4 -; PWR9BE-NEXT: li r3, 0 -; PWR9BE-NEXT: vperm v5, v2, v2, v5 -; PWR9BE-NEXT: vadduwm v3, v4, v3 -; PWR9BE-NEXT: vextsb2w v5, v5 -; PWR9BE-NEXT: vperm v2, v2, v2, v0 -; PWR9BE-NEXT: vextsb2w v2, v2 -; PWR9BE-NEXT: vadduwm v2, v2, v5 -; PWR9BE-NEXT: vadduwm v2, v3, v2 -; PWR9BE-NEXT: xxswapd v3, v2 -; PWR9BE-NEXT: vadduwm v2, v2, v3 -; PWR9BE-NEXT: xxspltw v3, v2, 1 -; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltib v4, 1 +; PWR9BE-NEXT: xxlxor v3, v3, v3 +; PWR9BE-NEXT: li r3, 12 +; PWR9BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9BE-NEXT: vsumsws v2, v2, v3 ; PWR9BE-NEXT: vextuwlx r3, r3, v2 ; PWR9BE-NEXT: extsw r3, r3 ; PWR9BE-NEXT: blr ; ; PWR10LE-LABEL: v16i8tov16i32_sign: ; PWR10LE: # %bb.0: # %entry -; PWR10LE-NEXT: plxv v3, .LCPI17_0@PCREL(0), 1 -; PWR10LE-NEXT: plxv v4, .LCPI17_1@PCREL(0), 1 -; PWR10LE-NEXT: li r3, 0 -; PWR10LE-NEXT: vperm v3, v2, v2, v3 -; PWR10LE-NEXT: plxv v5, .LCPI17_2@PCREL(0), 1 -; PWR10LE-NEXT: plxv v0, .LCPI17_3@PCREL(0), 1 -; PWR10LE-NEXT: vperm v4, v2, v2, v4 -; PWR10LE-NEXT: vperm v5, v2, v2, v5 -; PWR10LE-NEXT: vperm v2, v2, v2, v0 -; PWR10LE-NEXT: vextsb2w v3, v3 -; PWR10LE-NEXT: vextsb2w v4, v4 -; PWR10LE-NEXT: vextsb2w v5, v5 -; PWR10LE-NEXT: vextsb2w v2, v2 -; PWR10LE-NEXT: vadduwm v2, v2, v5 -; PWR10LE-NEXT: vadduwm v3, v4, v3 -; PWR10LE-NEXT: vadduwm v2, v3, v2 -; PWR10LE-NEXT: 
xxswapd v3, v2 -; PWR10LE-NEXT: vadduwm v2, v2, v3 -; PWR10LE-NEXT: xxspltw v3, v2, 2 -; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltib v4, 1 +; PWR10LE-NEXT: xxlxor v3, v3, v3 +; PWR10LE-NEXT: li r3, 12 +; PWR10LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10LE-NEXT: vsumsws v2, v2, v3 ; PWR10LE-NEXT: vextuwrx r3, r3, v2 ; PWR10LE-NEXT: extsw r3, r3 ; PWR10LE-NEXT: blr ; ; PWR10BE-LABEL: v16i8tov16i32_sign: ; PWR10BE: # %bb.0: # %entry -; PWR10BE-NEXT: addis r3, r2, .LCPI17_0@toc@ha -; PWR10BE-NEXT: addi r3, r3, .LCPI17_0@toc@l -; PWR10BE-NEXT: lxv v3, 0(r3) -; PWR10BE-NEXT: addis r3, r2, .LCPI17_1@toc@ha -; PWR10BE-NEXT: addi r3, r3, .LCPI17_1@toc@l -; PWR10BE-NEXT: lxv v4, 0(r3) -; PWR10BE-NEXT: addis r3, r2, .LCPI17_2@toc@ha -; PWR10BE-NEXT: addi r3, r3, .LCPI17_2@toc@l -; PWR10BE-NEXT: vperm v3, v2, v2, v3 -; PWR10BE-NEXT: lxv v5, 0(r3) -; PWR10BE-NEXT: addis r3, r2, .LCPI17_3@toc@ha -; PWR10BE-NEXT: vextsb2w v3, v3 -; PWR10BE-NEXT: addi r3, r3, .LCPI17_3@toc@l -; PWR10BE-NEXT: vperm v4, v2, v2, v4 -; PWR10BE-NEXT: lxv v0, 0(r3) -; PWR10BE-NEXT: li r3, 0 -; PWR10BE-NEXT: vextsb2w v4, v4 -; PWR10BE-NEXT: vperm v5, v2, v2, v5 -; PWR10BE-NEXT: vadduwm v3, v4, v3 -; PWR10BE-NEXT: vextsb2w v5, v5 -; PWR10BE-NEXT: vperm v2, v2, v2, v0 -; PWR10BE-NEXT: vextsb2w v2, v2 -; PWR10BE-NEXT: vadduwm v2, v2, v5 -; PWR10BE-NEXT: vadduwm v2, v3, v2 -; PWR10BE-NEXT: xxswapd v3, v2 -; PWR10BE-NEXT: vadduwm v2, v2, v3 -; PWR10BE-NEXT: xxspltw v3, v2, 1 -; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltib v4, 1 +; PWR10BE-NEXT: xxlxor v3, v3, v3 +; PWR10BE-NEXT: li r3, 12 +; PWR10BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10BE-NEXT: vsumsws v2, v2, v3 ; PWR10BE-NEXT: vextuwlx r3, r3, v2 ; PWR10BE-NEXT: extsw r3, r3 ; PWR10BE-NEXT: blr @@ -1116,113 +968,41 @@ define dso_local zeroext i32 @v16i8tov16i32_zero(<16 x i8> %a) local_unnamed_addr #0 { ; PWR9LE-LABEL: v16i8tov16i32_zero: ; PWR9LE: # %bb.0: # %entry -; PWR9LE-NEXT: addis r3, r2, .LCPI18_0@toc@ha -; PWR9LE-NEXT: 
xxlxor v4, v4, v4 -; PWR9LE-NEXT: addi r3, r3, .LCPI18_0@toc@l -; PWR9LE-NEXT: lxv v3, 0(r3) -; PWR9LE-NEXT: addis r3, r2, .LCPI18_1@toc@ha -; PWR9LE-NEXT: addi r3, r3, .LCPI18_1@toc@l -; PWR9LE-NEXT: lxv v5, 0(r3) -; PWR9LE-NEXT: addis r3, r2, .LCPI18_2@toc@ha -; PWR9LE-NEXT: vperm v3, v4, v2, v3 -; PWR9LE-NEXT: addi r3, r3, .LCPI18_2@toc@l -; PWR9LE-NEXT: lxv v0, 0(r3) -; PWR9LE-NEXT: addis r3, r2, .LCPI18_3@toc@ha -; PWR9LE-NEXT: vperm v5, v4, v2, v5 -; PWR9LE-NEXT: addi r3, r3, .LCPI18_3@toc@l -; PWR9LE-NEXT: lxv v1, 0(r3) -; PWR9LE-NEXT: vadduwm v3, v5, v3 -; PWR9LE-NEXT: li r3, 0 -; PWR9LE-NEXT: vperm v0, v4, v2, v0 -; PWR9LE-NEXT: vperm v2, v4, v2, v1 -; PWR9LE-NEXT: vadduwm v2, v2, v0 -; PWR9LE-NEXT: vadduwm v2, v3, v2 -; PWR9LE-NEXT: xxswapd v3, v2 -; PWR9LE-NEXT: vadduwm v2, v2, v3 -; PWR9LE-NEXT: xxspltw v3, v2, 2 -; PWR9LE-NEXT: vadduwm v2, v2, v3 +; PWR9LE-NEXT: xxspltib v4, 1 +; PWR9LE-NEXT: xxlxor v3, v3, v3 +; PWR9LE-NEXT: li r3, 12 +; PWR9LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9LE-NEXT: vsumsws v2, v2, v3 ; PWR9LE-NEXT: vextuwrx r3, r3, v2 ; PWR9LE-NEXT: blr ; ; PWR9BE-LABEL: v16i8tov16i32_zero: ; PWR9BE: # %bb.0: # %entry -; PWR9BE-NEXT: addis r3, r2, .LCPI18_0@toc@ha -; PWR9BE-NEXT: xxlxor v4, v4, v4 -; PWR9BE-NEXT: addi r3, r3, .LCPI18_0@toc@l -; PWR9BE-NEXT: lxv v3, 0(r3) -; PWR9BE-NEXT: addis r3, r2, .LCPI18_1@toc@ha -; PWR9BE-NEXT: addi r3, r3, .LCPI18_1@toc@l -; PWR9BE-NEXT: lxv v5, 0(r3) -; PWR9BE-NEXT: addis r3, r2, .LCPI18_2@toc@ha -; PWR9BE-NEXT: vperm v3, v4, v2, v3 -; PWR9BE-NEXT: addi r3, r3, .LCPI18_2@toc@l -; PWR9BE-NEXT: lxv v0, 0(r3) -; PWR9BE-NEXT: addis r3, r2, .LCPI18_3@toc@ha -; PWR9BE-NEXT: vperm v5, v4, v2, v5 -; PWR9BE-NEXT: addi r3, r3, .LCPI18_3@toc@l -; PWR9BE-NEXT: lxv v1, 0(r3) -; PWR9BE-NEXT: vadduwm v3, v5, v3 -; PWR9BE-NEXT: li r3, 0 -; PWR9BE-NEXT: vperm v0, v4, v2, v0 -; PWR9BE-NEXT: vperm v2, v4, v2, v1 -; PWR9BE-NEXT: vadduwm v2, v2, v0 -; PWR9BE-NEXT: vadduwm v2, v3, v2 -; PWR9BE-NEXT: xxswapd v3, v2 -; 
PWR9BE-NEXT: vadduwm v2, v2, v3 -; PWR9BE-NEXT: xxspltw v3, v2, 1 -; PWR9BE-NEXT: vadduwm v2, v2, v3 +; PWR9BE-NEXT: xxspltib v4, 1 +; PWR9BE-NEXT: xxlxor v3, v3, v3 +; PWR9BE-NEXT: li r3, 12 +; PWR9BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR9BE-NEXT: vsumsws v2, v2, v3 ; PWR9BE-NEXT: vextuwlx r3, r3, v2 ; PWR9BE-NEXT: blr ; ; PWR10LE-LABEL: v16i8tov16i32_zero: ; PWR10LE: # %bb.0: # %entry -; PWR10LE-NEXT: plxv v3, .LCPI18_0@PCREL(0), 1 -; PWR10LE-NEXT: plxv v5, .LCPI18_1@PCREL(0), 1 -; PWR10LE-NEXT: xxlxor v4, v4, v4 -; PWR10LE-NEXT: li r3, 0 -; PWR10LE-NEXT: vperm v3, v4, v2, v3 -; PWR10LE-NEXT: plxv v0, .LCPI18_2@PCREL(0), 1 -; PWR10LE-NEXT: plxv v1, .LCPI18_3@PCREL(0), 1 -; PWR10LE-NEXT: vperm v5, v4, v2, v5 -; PWR10LE-NEXT: vperm v0, v4, v2, v0 -; PWR10LE-NEXT: vperm v2, v4, v2, v1 -; PWR10LE-NEXT: vadduwm v2, v2, v0 -; PWR10LE-NEXT: vadduwm v3, v5, v3 -; PWR10LE-NEXT: vadduwm v2, v3, v2 -; PWR10LE-NEXT: xxswapd v3, v2 -; PWR10LE-NEXT: vadduwm v2, v2, v3 -; PWR10LE-NEXT: xxspltw v3, v2, 2 -; PWR10LE-NEXT: vadduwm v2, v2, v3 +; PWR10LE-NEXT: xxspltib v4, 1 +; PWR10LE-NEXT: xxlxor v3, v3, v3 +; PWR10LE-NEXT: li r3, 12 +; PWR10LE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10LE-NEXT: vsumsws v2, v2, v3 ; PWR10LE-NEXT: vextuwrx r3, r3, v2 ; PWR10LE-NEXT: blr ; ; PWR10BE-LABEL: v16i8tov16i32_zero: ; PWR10BE: # %bb.0: # %entry -; PWR10BE-NEXT: addis r3, r2, .LCPI18_0@toc@ha -; PWR10BE-NEXT: xxlxor v4, v4, v4 -; PWR10BE-NEXT: addi r3, r3, .LCPI18_0@toc@l -; PWR10BE-NEXT: lxv v3, 0(r3) -; PWR10BE-NEXT: addis r3, r2, .LCPI18_1@toc@ha -; PWR10BE-NEXT: addi r3, r3, .LCPI18_1@toc@l -; PWR10BE-NEXT: lxv v5, 0(r3) -; PWR10BE-NEXT: addis r3, r2, .LCPI18_2@toc@ha -; PWR10BE-NEXT: addi r3, r3, .LCPI18_2@toc@l -; PWR10BE-NEXT: vperm v3, v4, v2, v3 -; PWR10BE-NEXT: lxv v0, 0(r3) -; PWR10BE-NEXT: addis r3, r2, .LCPI18_3@toc@ha -; PWR10BE-NEXT: addi r3, r3, .LCPI18_3@toc@l -; PWR10BE-NEXT: vperm v5, v4, v2, v5 -; PWR10BE-NEXT: lxv v1, 0(r3) -; PWR10BE-NEXT: li r3, 0 -; PWR10BE-NEXT: vadduwm 
v3, v5, v3 -; PWR10BE-NEXT: vperm v0, v4, v2, v0 -; PWR10BE-NEXT: vperm v2, v4, v2, v1 -; PWR10BE-NEXT: vadduwm v2, v2, v0 -; PWR10BE-NEXT: vadduwm v2, v3, v2 -; PWR10BE-NEXT: xxswapd v3, v2 -; PWR10BE-NEXT: vadduwm v2, v2, v3 -; PWR10BE-NEXT: xxspltw v3, v2, 1 -; PWR10BE-NEXT: vadduwm v2, v2, v3 +; PWR10BE-NEXT: xxspltib v4, 1 +; PWR10BE-NEXT: xxlxor v3, v3, v3 +; PWR10BE-NEXT: li r3, 12 +; PWR10BE-NEXT: vmsumubm v2, v2, v4, v3 +; PWR10BE-NEXT: vsumsws v2, v2, v3 ; PWR10BE-NEXT: vextuwlx r3, r3, v2 ; PWR10BE-NEXT: blr entry: