Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -460,6 +460,10 @@
     /// v2f32 value into the lower half of a VSR register.
     LD_VSX_LH,
 
+    /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory
+    /// instruction such as LXVDSX or LXVWSX.
+    LD_SPLAT,
+
     /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
     /// Maps directly to an stxvd2x instruction that will be preceded by
     /// an xxswapd.
@@ -563,9 +567,11 @@
     bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                          unsigned &InsertAtByte, bool &Swap, bool IsLE);
 
-    /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
-    /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
-    unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
+    /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+    /// appropriate for PPC mnemonics (which have a big endian bias - namely
+    /// elements are counted from the left of the vector register).
+    unsigned getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+                                        SelectionDAG &DAG);
 
     /// get_VSPLTI_elt - If this is a build_vector of constants which can be
     /// formed by using a vspltis[bhw] instruction of the specified element
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1396,6 +1396,7 @@
   case PPCISD::EXTRACT_SPE:     return "PPCISD::EXTRACT_SPE";
   case PPCISD::EXTSWSLI:        return "PPCISD::EXTSWSLI";
   case PPCISD::LD_VSX_LH:       return "PPCISD::LD_VSX_LH";
+  case PPCISD::LD_SPLAT:        return "PPCISD::LD_SPLAT";
   case PPCISD::FP_EXTEND_LH:    return "PPCISD::FP_EXTEND_LH";
   }
   return nullptr;
@@ -1769,10 +1770,10 @@
 
 /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a splat of a single element that is suitable for input to
-/// VSPLTB/VSPLTH/VSPLTW.
+/// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
 bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
-  assert(N->getValueType(0) == MVT::v16i8 &&
-         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+  assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) &&
+         EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
 
   // The consecutive indices need to specify an element, not part of two
   // different elements.  So abandon ship early if this isn't the case.
@@ -2065,10 +2066,11 @@
 }
 
 
-/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
-/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
-unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
-                                SelectionDAG &DAG) {
+/// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
+/// appropriate for PPC mnemonics (which have a big endian bias - namely
+/// elements are counted from the left of the vector register).
+unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
+                                         SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   assert(isSplatShuffleMask(SVOp, EltSize));
   if (DAG.getDataLayout().isLittleEndian())
@@ -8138,6 +8140,20 @@
                      Op0.getOperand(1));
 }
 
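+// Look through an optional BITCAST and/or SCALAR_TO_VECTOR to find the
+// underlying normal (non-extending, non-indexed) load, if there is one.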
+const SDValue *getNormalLoadInput(const SDValue &Op) {
+  const SDValue *InputLoad = &Op;
+  if (InputLoad->getOpcode() == ISD::BITCAST)
+    InputLoad = &InputLoad->getOperand(0);
+  if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
+    InputLoad = &InputLoad->getOperand(0);
+  if (InputLoad->getOpcode() != ISD::LOAD)
+    return nullptr;
+  LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+  return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
+}
+
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.  If we CAN select this case, and if it
 // selects to a single instruction, return Op.  Otherwise, if we can codegen
@@ -8260,6 +8276,34 @@
   if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
       SplatBitSize > 32) {
+
+    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+    // Handle load-and-splat patterns as we have instructions that will do this
+    // in one go.
+    if (InputLoad && DAG.isSplatValue(Op, true)) {
+      LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+
+      // We have handling for 4 and 8 byte elements.
+      unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+
+      // Checking for a single use of this load, we have to check for vector
+      // width (128 bits) / ElementSize uses (since each operand of the
+      // BUILD_VECTOR is a separate use of the value).
+      if (InputLoad->getNode()->hasNUsesOfValue(128 / ElementSize, 0) &&
+          ((Subtarget.hasVSX() && ElementSize == 64) ||
+           (Subtarget.hasP9Vector() && ElementSize == 32))) {
+        SDValue Ops[] = {
+          LD->getChain(),                     // Chain
+          LD->getBasePtr(),                   // Ptr
+          DAG.getValueType(Op.getValueType()) // VT
+        };
+        return
+          DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl,
+                                  DAG.getVTList(Op.getValueType(), MVT::Other),
+                                  Ops, LD->getMemoryVT(), LD->getMemOperand());
+      }
+    }
+
     // BUILD_VECTOR nodes that are not constant splats of up to 32-bits can be
     // lowered to VSX instructions under certain conditions.
     // Without VSX, there is no pattern more efficient than expanding the node.
@@ -8745,6 +8789,48 @@
 
   unsigned ShiftElts, InsertAtByte;
   bool Swap = false;
+
+  // If this is a load-and-splat, we can do that with a single instruction
+  // in some cases. However if the load has multiple uses, we don't want to
+  // combine it because that will just produce multiple loads.
+  const SDValue *InputLoad = getNormalLoadInput(V1);
+  if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
+      (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
+      InputLoad->hasOneUse()) {
+    bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
+    int SplatIdx =
+      PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+
+    LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
+    // For 4-byte load-and-splat, we need Power9.
+    if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
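+      // The splat index is counted with a big endian bias (from the left of
+      // the register), so for little endian the memory offset has to be
+      // computed from the opposite end of the vector.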
+      uint64_t Offset = 0;
+      if (IsFourByte)
+        Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
+      else
+        Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+      SDValue BasePtr = LD->getBasePtr();
+      if (Offset != 0)
+        BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
+                              BasePtr, DAG.getIntPtrConstant(Offset, dl));
+      SDValue Ops[] = {
+        LD->getChain(),                     // Chain
+        BasePtr,                            // BasePtr
+        DAG.getValueType(Op.getValueType()) // VT
+      };
+      SDVTList VTL =
+        DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
+      SDValue LdSplt =
+        DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
+                                Ops, LD->getMemoryVT(), LD->getMemOperand());
+      if (LdSplt.getValueType() != SVOp->getValueType(0))
+        LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
+      return LdSplt;
+    }
+  }
 
   if (Subtarget.hasP9Vector() &&
       PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) {
@@ -8821,7 +8907,7 @@
 
   if (Subtarget.hasVSX()) {
     if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
-      int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+      int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);
 
       SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
       SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
Index: lib/Target/PowerPC/PPCInstrAltivec.td
===================================================================
--- lib/Target/PowerPC/PPCInstrAltivec.td
+++ lib/Target/PowerPC/PPCInstrAltivec.td
@@ -215,21 +215,21 @@
 
 // VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
 def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::getVSPLTImmediate(N, 1, *CurDAG), SDLoc(N));
+  return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 1, *CurDAG), SDLoc(N));
 }]>;
 def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle node:$lhs, node:$rhs), [{
   return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
 }], VSPLTB_get_imm>;
 def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::getVSPLTImmediate(N, 2, *CurDAG), SDLoc(N));
+  return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 2, *CurDAG), SDLoc(N));
 }]>;
 def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle node:$lhs, node:$rhs), [{
   return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
 }], VSPLTH_get_imm>;
 def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
-  return getI32Imm(PPC::getVSPLTImmediate(N, 4, *CurDAG), SDLoc(N));
+  return getI32Imm(PPC::getSplatIdxForPPCMnemonics(N, 4, *CurDAG), SDLoc(N));
 }]>;
 def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
                              (vector_shuffle node:$lhs, node:$rhs), [{
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -58,6 +58,10 @@
   SDTCisVT<0, v4f32>, SDTCisPtrTy<1>
 ]>;
 
+def SDT_PPCldsplat : SDTypeProfile<1, 1, [
+  SDTCisVec<0>, SDTCisPtrTy<1>
+]>;
+
 def SDT_PPCfpextlh : SDTypeProfile<1, 1, [
   SDTCisVT<0, v2f64>, SDTCisVT<1, v4f32>
 ]>;
@@ -96,6 +100,8 @@
 def PPCfpextlh : SDNode<"PPCISD::FP_EXTEND_LH", SDT_PPCfpextlh, []>;
 def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
                        string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -3858,6 +3864,10 @@
              (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
   def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
             (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+  def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
+            (v2f64 (LXVDSX xoaddr:$A))>;
+  def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
+            (v2i64 (LXVDSX xoaddr:$A))>;
 
   // Build vectors of floating point converted to i64.
   def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
@@ -4102,6 +4112,10 @@
             (v2i64 (XXPERMDIs
                     (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A),
                                                   VSFRC)), 0))>;
+  def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
+            (v4f32 (LXVWSX xoaddr:$A))>;
+  def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
+            (v4i32 (LXVWSX xoaddr:$A))>;
 }
 
 let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
Index: test/CodeGen/PowerPC/VSX-XForm-Scalars.ll
===================================================================
--- test/CodeGen/PowerPC/VSX-XForm-Scalars.ll
+++ test/CodeGen/PowerPC/VSX-XForm-Scalars.ll
@@ -27,18 +27,16 @@
 ;
 ; CHECK-P9-LABEL: testExpandPostRAPseudo:
 ; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9: lfiwzx f0, 0, r3
 ; CHECK-P9: addis r4, r2, .LC0@toc@ha
+; CHECK-P9: lxvwsx vs0, 0, r3
 ; CHECK-P9: ld r4, .LC0@toc@l(r4)
-; CHECK-P9: xxpermdi vs0, f0, f0, 2
-; CHECK-P9: xxspltw vs0, vs0, 3
 ; CHECK-P9: stxvx vs0, 0, r4
+; CHECK-P9: lis r4, 1024
 ; CHECK-P9: lfiwax f0, 0, r3
 ; CHECK-P9: addis r3, r2, .LC1@toc@ha
 ; CHECK-P9: ld r3, .LC1@toc@l(r3)
 ; CHECK-P9: xscvsxdsp f0, f0
 ; CHECK-P9: ld r3, 0(r3)
-; CHECK-P9: lis r4, 1024
 ; CHECK-P9: stfsx f0, r3, r4
 ; CHECK-P9: blr
 entry:
Index: test/CodeGen/PowerPC/build-vector-tests.ll
===================================================================
--- test/CodeGen/PowerPC/build-vector-tests.ll
+++ test/CodeGen/PowerPC/build-vector-tests.ll
@@ -1327,16 +1327,12 @@
 define <4 x i32> @spltMemVali(i32* nocapture readonly %ptr) {
 ; P9BE-LABEL: spltMemVali:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    lfiwzx f0, 0, r3
-; P9BE-NEXT:    xxsldwi vs0, f0, f0, 1
-; P9BE-NEXT:    xxspltw v2, vs0, 0
+; P9BE-NEXT:    lxvwsx v2, 0, r3
 ; P9BE-NEXT:    blr
 ;
 ; P9LE-LABEL: spltMemVali:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    lfiwzx f0, 0, r3
-; P9LE-NEXT:    xxpermdi vs0, f0, f0, 2
-; P9LE-NEXT:    xxspltw v2, vs0, 3
+; P9LE-NEXT:    lxvwsx v2, 0, r3
 ; P9LE-NEXT:    blr
 ;
 ; P8BE-LABEL: spltMemVali:
@@ -2911,16 +2907,12 @@
 define <4 x i32> @spltMemValui(i32* nocapture readonly %ptr) {
 ; P9BE-LABEL: spltMemValui:
 ; P9BE:       # %bb.0: # %entry
-; P9BE-NEXT:    lfiwzx f0, 0, r3
-; P9BE-NEXT:    xxsldwi vs0, f0, f0, 1
-; P9BE-NEXT:    xxspltw v2, vs0, 0
+; P9BE-NEXT:    lxvwsx v2, 0, r3
 ; P9BE-NEXT:    blr
 ;
 ; P9LE-LABEL: spltMemValui:
 ; P9LE:       # %bb.0: # %entry
-; P9LE-NEXT:    lfiwzx f0, 0, r3
-; P9LE-NEXT:    xxpermdi vs0, f0, f0, 2
-; P9LE-NEXT:    xxspltw v2, vs0, 3
+; P9LE-NEXT:    lxvwsx v2, 0, r3
 ; P9LE-NEXT:    blr
 ;
 ; P8BE-LABEL: spltMemValui:
Index: test/CodeGen/PowerPC/load-and-splat.ll
===================================================================
--- test/CodeGen/PowerPC/load-and-splat.ll
+++ test/CodeGen/PowerPC/load-and-splat.ll
@@ -0,0 +1,266 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \
+; RUN:   -check-prefix=P9
+; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
+; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \
+; RUN:   -check-prefix=P8
+
+define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readonly %a) local_unnamed_addr {
+; P9-LABEL: test:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r4, r4, 24
+; P9-NEXT:    lxvdsx vs0, 0, r4
+; P9-NEXT:    stxv vs0, 0(r3)
+; P9-NEXT:    blr
+;
+; P8-LABEL: test:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    addi r4, r4, 24
+; P8-NEXT:    lxvdsx vs0, 0, r4
+; P8-NEXT:    stxvd2x vs0, 0, r3
+; P8-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds double, double* %a, i64 3
+  %0 = load double, double* %arrayidx, align 8
+  %splat.splatinsert.i = insertelement <2 x double> undef, double %0, i32 0
+  %splat.splat.i = shufflevector <2 x double> %splat.splatinsert.i, <2 x double> undef, <2 x i32> zeroinitializer
+  store <2 x double> %splat.splat.i, <2 x double>* %c, align 16
+  ret void
+}
+
+define dso_local void @test2(<4 x float>* nocapture %c, float* nocapture readonly %a) local_unnamed_addr {
+; P9-LABEL: test2:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r4, r4, 12
+; P9-NEXT:    lxvwsx vs0, 0, r4
+; P9-NEXT:    stxv vs0, 0(r3)
+; P9-NEXT:    blr
+;
+; P8-LABEL: test2:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    addi r4, r4, 12
+; P8-NEXT:    lfiwzx f0, 0, r4
+; P8-NEXT:    xxpermdi vs0, f0, f0, 2
+; P8-NEXT:    xxspltw v2, vs0, 3
+; P8-NEXT:    stvx v2, 0, r3
+; P8-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds float, float* %a, i64 3
+  %0 = load float, float* %arrayidx, align 4
+  %splat.splatinsert.i = insertelement <4 x float> undef, float %0, i32 0
+  %splat.splat.i = shufflevector <4 x float> %splat.splatinsert.i, <4 x float> undef, <4 x i32> zeroinitializer
+  store <4 x float> %splat.splat.i, <4 x float>* %c, align 16
+  ret void
+}
+
+define dso_local void @test3(<4 x i32>* nocapture %c, i32* nocapture readonly %a) local_unnamed_addr {
+; P9-LABEL: test3:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r4, r4, 12
+; P9-NEXT:    lxvwsx vs0, 0, r4
+; P9-NEXT:    stxv vs0, 0(r3)
+; P9-NEXT:    blr
+;
+; P8-LABEL: test3:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    addi r4, r4, 12
+; P8-NEXT:    lfiwzx f0, 0, r4
+; P8-NEXT:    xxpermdi vs0, f0, f0, 2
+; P8-NEXT:    xxspltw v2, vs0, 3
+; P8-NEXT:    stvx v2, 0, r3
+; P8-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 3
+  %0 = load i32, i32* %arrayidx, align 4
+  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  store <4 x i32> %splat.splat.i, <4 x i32>* %c, align 16
+  ret void
+}
+
+define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a) local_unnamed_addr {
+; P9-LABEL: test4:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r4, r4, 24
+; P9-NEXT:    lxvdsx vs0, 0, r4
+; P9-NEXT:    stxv vs0, 0(r3)
+; P9-NEXT:    blr
+;
+; P8-LABEL: test4:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    addi r4, r4, 24
+; P8-NEXT:    lxvdsx vs0, 0, r4
+; P8-NEXT:    stxvd2x vs0, 0, r3
+; P8-NEXT:    blr
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 3
+  %0 = load i64, i64* %arrayidx, align 8
+  %splat.splatinsert.i = insertelement <2 x i64> undef, i64 %0, i32 0
+  %splat.splat.i = shufflevector <2 x i64> %splat.splatinsert.i, <2 x i64> undef, <2 x i32> zeroinitializer
+  store <2 x i64> %splat.splat.i, <2 x i64>* %c, align 16
+  ret void
+}
+
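+; The tests below use shufflevector splats of a vector load; these should
+; likewise be selected as a single load-and-splat instruction.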
+define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) {
+; P9-LABEL: unadjusted_lxvwsx:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    lxvwsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: unadjusted_lxvwsx:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lfiwzx f0, 0, r3
+; P8-NEXT:    xxpermdi vs0, f0, f0, 2
+; P8-NEXT:    xxspltw v2, vs0, 3
+; P8-NEXT:    blr
+  entry:
+    %0 = bitcast i32* %s to <4 x i8>*
+    %1 = load <4 x i8>, <4 x i8>* %0, align 4
+    %2 = shufflevector <4 x i8> %1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+    ret <16 x i8> %2
+}
+
+define <16 x i8> @adjusted_lxvwsx(i64* %s, i64* %t) {
+; P9-LABEL: adjusted_lxvwsx:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r3, r3, 4
+; P9-NEXT:    lxvwsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: adjusted_lxvwsx:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    ld r3, 0(r3)
+; P8-NEXT:    mtvsrd f0, r3
+; P8-NEXT:    xxswapd v2, vs0
+; P8-NEXT:    xxspltw v2, v2, 2
+; P8-NEXT:    blr
+  entry:
+    %0 = bitcast i64* %s to <8 x i8>*
+    %1 = load <8 x i8>, <8 x i8>* %0, align 8
+    %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+    ret <16 x i8> %2
+}
+
+define <16 x i8> @unadjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
+; P9-LABEL: unadjusted_lxvwsx_v16i8:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    lxvwsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: unadjusted_lxvwsx_v16i8:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    xxspltw v2, v2, 3
+; P8-NEXT:    blr
+  entry:
+    %0 = load <16 x i8>, <16 x i8>* %s, align 16
+    %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+    ret <16 x i8> %1
+}
+
+define <16 x i8> @adjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
+; P9-LABEL: adjusted_lxvwsx_v16i8:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r3, r3, 4
+; P9-NEXT:    lxvwsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: adjusted_lxvwsx_v16i8:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    xxspltw v2, v2, 2
+; P8-NEXT:    blr
+  entry:
+    %0 = load <16 x i8>, <16 x i8>* %s, align 16
+    %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+    ret <16 x i8> %1
+}
+
+define <16 x i8> @adjusted_lxvwsx_v16i8_2(<16 x i8> *%s, <16 x i8> %t) {
+; P9-LABEL: adjusted_lxvwsx_v16i8_2:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r3, r3, 8
+; P9-NEXT:    lxvwsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: adjusted_lxvwsx_v16i8_2:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    xxspltw v2, v2, 1
+; P8-NEXT:    blr
+  entry:
+    %0 = load <16 x i8>, <16 x i8>* %s, align 16
+    %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
+    ret <16 x i8> %1
+}
+
+define <16 x i8> @adjusted_lxvwsx_v16i8_3(<16 x i8> *%s, <16 x i8> %t) {
+; P9-LABEL: adjusted_lxvwsx_v16i8_3:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r3, r3, 12
+; P9-NEXT:    lxvwsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: adjusted_lxvwsx_v16i8_3:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lvx v2, 0, r3
+; P8-NEXT:    xxspltw v2, v2, 0
+; P8-NEXT:    blr
+  entry:
+    %0 = load <16 x i8>, <16 x i8>* %s, align 16
+    %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15, i32 12, i32 13, i32 14, i32 15>
+    ret <16 x i8> %1
+}
+
+define <16 x i8> @unadjusted_lxvdsx(i64* %s, i64* %t) {
+; P9-LABEL: unadjusted_lxvdsx:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    lxvdsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: unadjusted_lxvdsx:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lxvdsx v2, 0, r3
+; P8-NEXT:    blr
+  entry:
+    %0 = bitcast i64* %s to <8 x i8>*
+    %1 = load <8 x i8>, <8 x i8>* %0, align 8
+    %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    ret <16 x i8> %2
+}
+
+define <16 x i8> @unadjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
+; P9-LABEL: unadjusted_lxvdsx_v16i8:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    lxvdsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: unadjusted_lxvdsx_v16i8:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lxvdsx v2, 0, r3
+; P8-NEXT:    blr
+  entry:
+    %0 = load <16 x i8>, <16 x i8>* %s, align 16
+    %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+    ret <16 x i8> %1
+}
+
+define <16 x i8> @adjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) {
+; P9-LABEL: adjusted_lxvdsx_v16i8:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    addi r3, r3, 8
+; P9-NEXT:    lxvdsx v2, 0, r3
+; P9-NEXT:    blr
+;
+; P8-LABEL: adjusted_lxvdsx_v16i8:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    addi r3, r3, 8
+; P8-NEXT:    lxvdsx v2, 0, r3
+; P8-NEXT:    blr
+  entry:
+    %0 = load <16 x i8>, <16 x i8>* %s, align 16
+    %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+    ret <16 x i8> %1
+}
Index: test/CodeGen/PowerPC/power9-moves-and-splats.ll
===================================================================
--- test/CodeGen/PowerPC/power9-moves-and-splats.ll
+++ test/CodeGen/PowerPC/power9-moves-and-splats.ll
@@ -61,16 +61,12 @@
 define <4 x i32> @test4(i32* nocapture readonly %in) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lfiwzx f0, 0, r3
-; CHECK-NEXT:    xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT:    xxspltw v2, vs0, 3
+; CHECK-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test4:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lfiwzx f0, 0, r3
-; CHECK-BE-NEXT:    xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT:    xxspltw v2, vs0, 0
+; CHECK-BE-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT:    blr
 entry:
 
@@ -83,16 +79,12 @@
 define <4 x float> @test5(float* nocapture readonly %in) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lfiwzx f0, 0, r3
-; CHECK-NEXT:    xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT:    xxspltw v2, vs0, 3
+; CHECK-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test5:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    lfiwzx f0, 0, r3
-; CHECK-BE-NEXT:    xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT:    xxspltw v2, vs0, 0
+; CHECK-BE-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT:    blr
 entry:
 
@@ -107,18 +99,14 @@
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r3, r2, .LC0@toc@ha
 ; CHECK-NEXT:    ld r3, .LC0@toc@l(r3)
-; CHECK-NEXT:    lfiwzx f0, 0, r3
-; CHECK-NEXT:    xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT:    xxspltw v2, vs0, 3
+; CHECK-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test6:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    addis r3, r2, .LC0@toc@ha
 ; CHECK-BE-NEXT:    ld r3, .LC0@toc@l(r3)
-; CHECK-BE-NEXT:    lfiwzx f0, 0, r3
-; CHECK-BE-NEXT:    xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT:    xxspltw v2, vs0, 0
+; CHECK-BE-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT:    blr
 entry:
 
@@ -133,18 +121,14 @@
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis r3, r2, .LC1@toc@ha
 ; CHECK-NEXT:    ld r3, .LC1@toc@l(r3)
-; CHECK-NEXT:    lfiwzx f0, 0, r3
-; CHECK-NEXT:    xxpermdi vs0, f0, f0, 2
-; CHECK-NEXT:    xxspltw v2, vs0, 3
+; CHECK-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: test7:
 ; CHECK-BE:       # %bb.0: # %entry
 ; CHECK-BE-NEXT:    addis r3, r2, .LC1@toc@ha
 ; CHECK-BE-NEXT:    ld r3, .LC1@toc@l(r3)
-; CHECK-BE-NEXT:    lfiwzx f0, 0, r3
-; CHECK-BE-NEXT:    xxsldwi vs0, f0, f0, 1
-; CHECK-BE-NEXT:    xxspltw v2, vs0, 0
+; CHECK-BE-NEXT:    lxvwsx v2, 0, r3
 ; CHECK-BE-NEXT:    blr
 entry:
 
Index: test/CodeGen/PowerPC/qpx-load-splat.ll
===================================================================
--- test/CodeGen/PowerPC/qpx-load-splat.ll
+++ test/CodeGen/PowerPC/qpx-load-splat.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -ppc-vsr-nums-as-vr \
 ; RUN:   -ppc-asm-full-reg-names -verify-machineinstrs < %s | FileCheck %s
 
@@ -34,9 +35,9 @@
 ; CHECK-LABEL: fooxu:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    sldi r4, r4, 3
-; CHECK-NEXT:    lfdux f0, r3, r4
-; CHECK-NEXT:    xxspltd v2, vs0, 0
-; CHECK-NEXT:    std r3, 0(r5)
+; CHECK-NEXT:    add r6, r3, r4
+; CHECK-NEXT:    std r6, 0(r5)
+; CHECK-NEXT:    lxvdsx v2, r3, r4
 ; CHECK-NEXT:    vmr v3, v2
 ; CHECK-NEXT:    blr
 entry:
Index: test/CodeGen/PowerPC/swaps-le-7.ll
===================================================================
--- test/CodeGen/PowerPC/swaps-le-7.ll
+++ test/CodeGen/PowerPC/swaps-le-7.ll
@@ -9,8 +9,8 @@
 @G4 = global <2 x double>
 
 ; CHECK-LABEL: @zg
-; CHECK: xxspltd
-; CHECK-NEXT: xxspltd
+; CHECK: lxvdsx
+; CHECK-NEXT: lxvdsx
 ; CHECK-NEXT: xvmuldp
 ; CHECK-DAG: xvmuldp
 ; CHECK-DAG: xvsubdp