Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
@@ -952,6 +952,8 @@
     SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) const;
+    SDValue LowerTRUNCATEVector(SDValue Op, SelectionDAG &DAG) const;
+
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -118,6 +118,8 @@
 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
+static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
@@ -639,6 +641,14 @@
       // with merges, splats, etc.
       setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+      // Vector truncates to sub-word integer that fit in an Altivec/VSX register
+      // are cheap, so handle them before they get expanded to scalar.
+      setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+      setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+
       setOperationAction(ISD::AND   , MVT::v4i32, Legal);
       setOperationAction(ISD::OR    , MVT::v4i32, Legal);
       setOperationAction(ISD::XOR   , MVT::v4i32, Legal);
@@ -6794,6 +6804,61 @@
                      Op.getOperand(0));
 }
 
+SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
+                                               SelectionDAG &DAG) const {
+
+  // Implements a vector truncate that fits in a vector register as a shuffle.
+  // We want to legalize vector truncates down to where the source fits in
+  // a vector register (and target is therefore smaller than vector register
+  // size). At that point legalization will try to custom lower the sub-legal
+  // result and get here - where we can contain the truncate as a single target
+  // operation.
+
+  // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
+  //   <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
+  //
+  // We will implement it for big-endian ordering as this (where x denotes
+  // undefined):
+  //   < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
+  //   < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
+  //
+  // The same operation in little-endian ordering will be:
+  //   <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
+  //   <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>
+
+  assert(Op.getValueType().isVector() && "Vector type expected.");
+
+  SDLoc DL(Op);
+  SDValue N1 = Op.getOperand(0);
+  unsigned SrcSize = N1.getValueType().getSizeInBits();
+  assert(SrcSize <= 128 && "Source must fit in an Altivec/VSX vector");
+  SDValue WideSrc = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
+
+  EVT TrgVT = Op.getValueType();
+  unsigned TrgNumElts = TrgVT.getVectorNumElements();
+  EVT EltVT = TrgVT.getVectorElementType();
+  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+  // First list the elements we want to keep.
+  unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
+  SmallVector<int, 16> ShuffV;
+  if (Subtarget.isLittleEndian())
+    for (unsigned i = 0; i < TrgNumElts; ++i)
+      ShuffV.push_back(i * SizeMult);
+  else
+    for (unsigned i = 1; i <= TrgNumElts; ++i)
+      ShuffV.push_back(i * SizeMult - 1);
+
+  // Populate the remaining elements with undefs.
+  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
+    // ShuffV.push_back(i + WideNumElts);
+    ShuffV.push_back(WideNumElts + 1);
+
+  SDValue Conv = DAG.getNode(ISD::BITCAST, DL, WideVT, WideSrc);
+  return DAG.getVectorShuffle(WideVT, DL, Conv, DAG.getUNDEF(WideVT), ShuffV);
+}
+
 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
 /// possible.
 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -9641,6 +9706,14 @@
       return;
     Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
     return;
+  case ISD::TRUNCATE: {
+    EVT TrgVT = N->getValueType(0);
+    if (TrgVT.isVector() &&
+        isOperationCustom(N->getOpcode(), TrgVT) &&
+        N->getOperand(0).getValueType().getSizeInBits() <= 128)
+      Results.push_back(LowerTRUNCATEVector(SDValue(N, 0), DAG));
+    return;
+  }
   case ISD::BITCAST:
     // Don't handle bitcast here.
     return;
Index: llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll
+++ llvm/trunk/test/CodeGen/PowerPC/vec-trunc.ll
@@ -10,90 +10,17 @@
; CHECK-LABEL: test8i8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lvx v2, 0, r4
-; CHECK-NEXT: mfvsrd r4, v2
-; CHECK-NEXT: xxswapd vs0, v2
-; CHECK-NEXT: clrldi r5, r4, 48
-; CHECK-NEXT: mtvsrd f1, r5
-; CHECK-NEXT: rldicl r5, r4, 48, 48
-; CHECK-NEXT: mtvsrd f2, r5
-; CHECK-NEXT: rldicl r5, r4, 32, 48
-; CHECK-NEXT: rldicl r4, r4, 16, 48
-; CHECK-NEXT: mtvsrd f3, r5
-; CHECK-NEXT: xxswapd v2, vs1
-; CHECK-NEXT: mfvsrd r5, f0
-; CHECK-NEXT: xxswapd v3, vs2
-; CHECK-NEXT: mtvsrd f0, r4
-; CHECK-NEXT: clrldi r4, r5, 48
-; CHECK-NEXT: mtvsrd f1, r4
-; CHECK-NEXT: rldicl r4, r5, 48, 48
-; CHECK-NEXT: xxswapd v4, vs0
-; CHECK-NEXT: mtvsrd f2, r4
-; CHECK-NEXT: rldicl r4, r5, 32, 48
-; CHECK-NEXT: rldicl r5, r5, 16, 48
-; CHECK-NEXT: vmrglb v2, v3, v2
-; CHECK-NEXT: xxswapd v3, vs3
-; CHECK-NEXT: mtvsrd f3, r4
-; CHECK-NEXT: xxswapd v5, vs1
-; CHECK-NEXT: mtvsrd f0, r5
-; CHECK-NEXT: xxswapd v0, vs2
-; CHECK-NEXT: xxswapd v1, vs3
-; CHECK-NEXT: vmrglb v3, v4, v3
-; CHECK-NEXT: xxswapd v6, vs0
-; CHECK-NEXT: vmrglb v4, v0, v5
-; CHECK-NEXT: vmrglb v5, v6, v1
-; CHECK-NEXT: vmrglh v2, v3, v2
-; CHECK-NEXT: vmrglh v3, v5, v4
-; CHECK-NEXT: vmrglw v2, v2, v3
+; CHECK-NEXT: vpkuhum v2, v2, v2
; CHECK-NEXT: xxswapd vs0, v2
; CHECK-NEXT: stfdx f0, 0, r3
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: test8i8:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxvw4x vs0, 0, r4
-; CHECK-BE-NEXT: addi r4, r1, -32
-; CHECK-BE-NEXT: stxvw4x vs0, 0, r4
-; CHECK-BE-NEXT: lhz r4, -18(r1)
-; CHECK-BE-NEXT: stb r4, -48(r1)
-; CHECK-BE-NEXT: lhz r4, -20(r1)
-; CHECK-BE-NEXT: stb r4, -64(r1)
-; CHECK-BE-NEXT: lhz r4, -22(r1)
-; CHECK-BE-NEXT: stb r4, -80(r1)
-; CHECK-BE-NEXT: lhz r4, -24(r1)
-; CHECK-BE-NEXT: stb r4, -96(r1)
-; CHECK-BE-NEXT: lhz r4, -26(r1)
-; CHECK-BE-NEXT: stb r4, -112(r1)
-; CHECK-BE-NEXT: lhz r4, -28(r1)
-; CHECK-BE-NEXT: stb r4, -128(r1)
-; CHECK-BE-NEXT: lhz r4, -30(r1)
-; CHECK-BE-NEXT: stb r4, -144(r1)
-; CHECK-BE-NEXT: lhz r4, -32(r1)
-; CHECK-BE-NEXT: stb r4, -160(r1)
-; CHECK-BE-NEXT: addi r4, r1, -48
; CHECK-BE-NEXT: lxvw4x v2, 0, r4
-; CHECK-BE-NEXT: addi r4, r1, -64
-; 
CHECK-BE-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -80 -; CHECK-BE-NEXT: lxvw4x v4, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -96 -; CHECK-BE-NEXT: lxvw4x v5, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -112 -; CHECK-BE-NEXT: lxvw4x v0, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -128 -; CHECK-BE-NEXT: lxvw4x v1, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -144 -; CHECK-BE-NEXT: lxvw4x v6, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -160 -; CHECK-BE-NEXT: lxvw4x v7, 0, r4 -; CHECK-BE-NEXT: vmrghb v2, v3, v2 -; CHECK-BE-NEXT: vmrghb v3, v5, v4 -; CHECK-BE-NEXT: vmrghb v4, v1, v0 -; CHECK-BE-NEXT: addi r4, r1, -16 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 -; CHECK-BE-NEXT: vmrghb v5, v7, v6 -; CHECK-BE-NEXT: vmrghh v3, v5, v4 -; CHECK-BE-NEXT: vmrghw v2, v3, v2 -; CHECK-BE-NEXT: stxvd2x v2, 0, r4 +; CHECK-BE-NEXT: addi r5, r1, -16 +; CHECK-BE-NEXT: vpkuhum v2, v2, v2 +; CHECK-BE-NEXT: stxvd2x v2, 0, r5 ; CHECK-BE-NEXT: ld r4, -16(r1) ; CHECK-BE-NEXT: std r4, 0(r3) ; CHECK-BE-NEXT: blr @@ -108,53 +35,17 @@ ; CHECK-LABEL: test4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lvx v2, 0, r4 -; CHECK-NEXT: xxswapd vs0, v2 -; CHECK-NEXT: mfvsrd r4, f0 -; CHECK-NEXT: clrldi r5, r4, 48 -; CHECK-NEXT: mtvsrd f0, r5 -; CHECK-NEXT: rldicl r5, r4, 48, 48 -; CHECK-NEXT: mtvsrd f1, r5 -; CHECK-NEXT: rldicl r5, r4, 32, 48 -; CHECK-NEXT: rldicl r4, r4, 16, 48 -; CHECK-NEXT: mtvsrd f2, r5 -; CHECK-NEXT: xxswapd v2, vs0 -; CHECK-NEXT: mtvsrd f3, r4 -; CHECK-NEXT: xxswapd v3, vs1 -; CHECK-NEXT: xxswapd v4, vs2 -; CHECK-NEXT: xxswapd v5, vs3 -; CHECK-NEXT: vmrglb v2, v3, v2 -; CHECK-NEXT: vmrglb v3, v5, v4 -; CHECK-NEXT: vmrglh v2, v3, v2 +; CHECK-NEXT: vpkuhum v2, v2, v2 ; CHECK-NEXT: xxsldwi vs0, v2, v2, 2 ; CHECK-NEXT: stfiwx f0, 0, r3 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test4i8: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -32 -; CHECK-BE-NEXT: stxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: lhz r4, -26(r1) -; CHECK-BE-NEXT: stb r4, -48(r1) -; CHECK-BE-NEXT: lhz r4, -28(r1) -; CHECK-BE-NEXT: stb r4, -64(r1) -; CHECK-BE-NEXT: lhz r4, -30(r1) -; CHECK-BE-NEXT: stb r4, -80(r1) -; CHECK-BE-NEXT: lhz r4, -32(r1) -; CHECK-BE-NEXT: stb r4, -96(r1) -; CHECK-BE-NEXT: addi r4, r1, -48 ; CHECK-BE-NEXT: lxvw4x v2, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -64 -; CHECK-BE-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -80 -; CHECK-BE-NEXT: lxvw4x v4, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -96 -; CHECK-BE-NEXT: lxvw4x v5, 0, r4 -; CHECK-BE-NEXT: vmrghb v2, v3, v2 -; CHECK-BE-NEXT: addi r4, r1, -16 -; CHECK-BE-NEXT: vmrghb v3, v5, v4 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 -; CHECK-BE-NEXT: stxvw4x v2, 0, r4 +; CHECK-BE-NEXT: addi r5, r1, -16 +; CHECK-BE-NEXT: vpkuhum v2, v2, v2 +; CHECK-BE-NEXT: stxvw4x v2, 0, r5 ; CHECK-BE-NEXT: lwz r4, -16(r1) ; CHECK-BE-NEXT: stw r4, 0(r3) ; CHECK-BE-NEXT: blr @@ -168,54 +59,23 @@ define void @test4i8w(<4 x i8>* nocapture %Sink, <4 x i32>* nocapture readonly %SrcPtr) { ; CHECK-LABEL: test4i8w: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lvx v2, 0, r4 -; CHECK-NEXT: xxswapd vs0, v2 -; CHECK-NEXT: mfvsrwz r4, v2 -; CHECK-NEXT: xxsldwi vs1, v2, v2, 1 -; CHECK-NEXT: xxsldwi vs3, v2, v2, 3 -; CHECK-NEXT: mtvsrd f2, r4 -; CHECK-NEXT: mfvsrwz r4, f0 -; CHECK-NEXT: mfvsrwz r5, f1 -; CHECK-NEXT: xxswapd v4, vs2 -; CHECK-NEXT: mtvsrd f0, r4 -; CHECK-NEXT: mfvsrwz r4, f3 -; CHECK-NEXT: mtvsrd f1, r5 -; CHECK-NEXT: xxswapd v2, vs0 -; CHECK-NEXT: mtvsrd f3, r4 -; CHECK-NEXT: xxswapd v3, vs1 -; CHECK-NEXT: xxswapd v5, vs3 -; CHECK-NEXT: vmrglb v2, v3, v2 -; CHECK-NEXT: vmrglb v3, v5, v4 -; 
CHECK-NEXT: vmrglh v2, v3, v2 +; CHECK-NEXT: addis r5, r2, .LCPI2_0@toc@ha +; CHECK-NEXT: lvx v3, 0, r4 +; CHECK-NEXT: addi r5, r5, .LCPI2_0@toc@l +; CHECK-NEXT: lvx v2, 0, r5 +; CHECK-NEXT: vperm v2, v3, v3, v2 ; CHECK-NEXT: xxsldwi vs0, v2, v2, 2 ; CHECK-NEXT: stfiwx f0, 0, r3 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test4i8w: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -32 -; CHECK-BE-NEXT: stxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: lwz r4, -20(r1) -; CHECK-BE-NEXT: stb r4, -48(r1) -; CHECK-BE-NEXT: lwz r4, -24(r1) -; CHECK-BE-NEXT: stb r4, -64(r1) -; CHECK-BE-NEXT: lwz r4, -28(r1) -; CHECK-BE-NEXT: stb r4, -80(r1) -; CHECK-BE-NEXT: lwz r4, -32(r1) -; CHECK-BE-NEXT: stb r4, -96(r1) -; CHECK-BE-NEXT: addi r4, r1, -48 +; CHECK-BE-NEXT: addis r5, r2, .LCPI2_0@toc@ha ; CHECK-BE-NEXT: lxvw4x v2, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -64 +; CHECK-BE-NEXT: addi r4, r5, .LCPI2_0@toc@l ; CHECK-BE-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -80 -; CHECK-BE-NEXT: lxvw4x v4, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -96 -; CHECK-BE-NEXT: lxvw4x v5, 0, r4 -; CHECK-BE-NEXT: vmrghb v2, v3, v2 ; CHECK-BE-NEXT: addi r4, r1, -16 -; CHECK-BE-NEXT: vmrghb v3, v5, v4 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-BE-NEXT: stxvw4x v2, 0, r4 ; CHECK-BE-NEXT: lwz r4, -16(r1) ; CHECK-BE-NEXT: stw r4, 0(r3) @@ -231,15 +91,7 @@ ; CHECK-LABEL: test2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lvx v2, 0, r4 -; CHECK-NEXT: xxswapd vs0, v2 -; CHECK-NEXT: mfvsrd r4, f0 -; CHECK-NEXT: clrldi r5, r4, 48 -; CHECK-NEXT: rldicl r4, r4, 48, 48 -; CHECK-NEXT: mtvsrd f0, r5 -; CHECK-NEXT: mtvsrd f1, r4 -; CHECK-NEXT: xxswapd v2, vs0 -; CHECK-NEXT: xxswapd v3, vs1 -; CHECK-NEXT: vmrglb v2, v3, v2 +; CHECK-NEXT: vpkuhum v2, v2, v2 ; CHECK-NEXT: xxswapd vs0, v2 ; CHECK-NEXT: mfvsrd r4, f0 ; CHECK-NEXT: clrldi r4, r4, 48 @@ -248,20 +100,10 @@ ; ; CHECK-BE-LABEL: test2i8: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -32 -; CHECK-BE-NEXT: stxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: lhz r4, -30(r1) -; CHECK-BE-NEXT: stb r4, -48(r1) -; CHECK-BE-NEXT: lhz r4, -32(r1) -; CHECK-BE-NEXT: stb r4, -64(r1) -; CHECK-BE-NEXT: addi r4, r1, -48 ; CHECK-BE-NEXT: lxvw4x v2, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -64 -; CHECK-BE-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -16 -; CHECK-BE-NEXT: vmrghb v2, v3, v2 -; CHECK-BE-NEXT: stxvw4x v2, 0, r4 +; CHECK-BE-NEXT: addi r5, r1, -16 +; CHECK-BE-NEXT: vpkuhum v2, v2, v2 +; CHECK-BE-NEXT: stxvw4x v2, 0, r5 ; CHECK-BE-NEXT: lhz r4, -16(r1) ; CHECK-BE-NEXT: sth r4, 0(r3) ; CHECK-BE-NEXT: blr @@ -276,54 +118,17 @@ ; CHECK-LABEL: test4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lvx v2, 0, r4 -; CHECK-NEXT: xxswapd vs0, v2 -; CHECK-NEXT: mfvsrwz r4, v2 -; CHECK-NEXT: xxsldwi vs1, v2, v2, 1 -; CHECK-NEXT: xxsldwi vs3, v2, v2, 3 -; CHECK-NEXT: mtvsrd f2, r4 -; CHECK-NEXT: mfvsrwz r4, f0 -; CHECK-NEXT: mfvsrwz r5, f1 -; CHECK-NEXT: xxswapd v4, vs2 -; CHECK-NEXT: mtvsrd f0, r4 -; CHECK-NEXT: mfvsrwz r4, f3 -; CHECK-NEXT: mtvsrd f1, r5 -; CHECK-NEXT: xxswapd v2, vs0 -; CHECK-NEXT: mtvsrd f3, r4 -; CHECK-NEXT: xxswapd v3, vs1 -; CHECK-NEXT: xxswapd v5, vs3 -; CHECK-NEXT: vmrglh v2, v3, v2 -; CHECK-NEXT: vmrglh v3, v5, v4 -; CHECK-NEXT: vmrglw v2, v3, v2 +; CHECK-NEXT: vpkuwum v2, v2, v2 ; CHECK-NEXT: xxswapd vs0, v2 ; CHECK-NEXT: stfdx f0, 0, r3 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test4i16: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: 
addi r4, r1, -32 -; CHECK-BE-NEXT: stxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: lwz r4, -20(r1) -; CHECK-BE-NEXT: sth r4, -48(r1) -; CHECK-BE-NEXT: lwz r4, -24(r1) -; CHECK-BE-NEXT: sth r4, -64(r1) -; CHECK-BE-NEXT: lwz r4, -28(r1) -; CHECK-BE-NEXT: sth r4, -80(r1) -; CHECK-BE-NEXT: lwz r4, -32(r1) -; CHECK-BE-NEXT: sth r4, -96(r1) -; CHECK-BE-NEXT: addi r4, r1, -48 ; CHECK-BE-NEXT: lxvw4x v2, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -64 -; CHECK-BE-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -80 -; CHECK-BE-NEXT: lxvw4x v4, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -96 -; CHECK-BE-NEXT: lxvw4x v5, 0, r4 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 -; CHECK-BE-NEXT: addi r4, r1, -16 -; CHECK-BE-NEXT: vmrghh v3, v5, v4 -; CHECK-BE-NEXT: vmrghw v2, v3, v2 -; CHECK-BE-NEXT: stxvd2x v2, 0, r4 +; CHECK-BE-NEXT: addi r5, r1, -16 +; CHECK-BE-NEXT: vpkuwum v2, v2, v2 +; CHECK-BE-NEXT: stxvd2x v2, 0, r5 ; CHECK-BE-NEXT: ld r4, -16(r1) ; CHECK-BE-NEXT: std r4, 0(r3) ; CHECK-BE-NEXT: blr @@ -338,35 +143,17 @@ ; CHECK-LABEL: test2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lvx v2, 0, r4 -; CHECK-NEXT: xxswapd vs0, v2 -; CHECK-NEXT: xxsldwi vs1, v2, v2, 1 -; CHECK-NEXT: mfvsrwz r4, f0 -; CHECK-NEXT: mfvsrwz r5, f1 -; CHECK-NEXT: mtvsrd f0, r4 -; CHECK-NEXT: mtvsrd f1, r5 -; CHECK-NEXT: xxswapd v2, vs0 -; CHECK-NEXT: xxswapd v3, vs1 -; CHECK-NEXT: vmrglh v2, v3, v2 +; CHECK-NEXT: vpkuwum v2, v2, v2 ; CHECK-NEXT: xxsldwi vs0, v2, v2, 2 ; CHECK-NEXT: stfiwx f0, 0, r3 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test2i16: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -32 -; CHECK-BE-NEXT: stxvw4x vs0, 0, r4 -; CHECK-BE-NEXT: lwz r4, -28(r1) -; CHECK-BE-NEXT: sth r4, -48(r1) -; CHECK-BE-NEXT: lwz r4, -32(r1) -; CHECK-BE-NEXT: sth r4, -64(r1) -; CHECK-BE-NEXT: addi r4, r1, -48 ; CHECK-BE-NEXT: lxvw4x v2, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -64 -; CHECK-BE-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -16 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 -; CHECK-BE-NEXT: stxvw4x v2, 0, r4 +; CHECK-BE-NEXT: addi r5, r1, -16 +; CHECK-BE-NEXT: vpkuwum v2, v2, v2 +; CHECK-BE-NEXT: stxvw4x v2, 0, r5 ; CHECK-BE-NEXT: lwz r4, -16(r1) ; CHECK-BE-NEXT: stw r4, 0(r3) ; CHECK-BE-NEXT: blr @@ -381,33 +168,23 @@ ; CHECK-LABEL: test2i16d: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvd2x vs0, 0, r4 -; CHECK-NEXT: xxswapd vs1, vs0 -; CHECK-NEXT: mfvsrwz r4, f0 -; CHECK-NEXT: mtvsrd f0, r4 -; CHECK-NEXT: mfvsrwz r5, f1 +; CHECK-NEXT: addis r5, r2, .LCPI6_0@toc@ha +; CHECK-NEXT: addi r4, r5, .LCPI6_0@toc@l +; CHECK-NEXT: lvx v3, 0, r4 ; CHECK-NEXT: xxswapd v2, vs0 -; CHECK-NEXT: mtvsrd f1, r5 -; CHECK-NEXT: xxswapd v3, vs1 -; CHECK-NEXT: vmrglh v2, v3, v2 +; CHECK-NEXT: vperm v2, v2, v2, v3 ; CHECK-NEXT: xxsldwi vs0, v2, v2, 2 ; CHECK-NEXT: stfiwx f0, 0, r3 ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test2i16d: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxvd2x vs0, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -32 -; CHECK-BE-NEXT: stxvd2x vs0, 0, r4 -; CHECK-BE-NEXT: lwz r4, -20(r1) -; CHECK-BE-NEXT: sth r4, -48(r1) -; CHECK-BE-NEXT: lwz r4, -28(r1) -; CHECK-BE-NEXT: sth r4, -64(r1) -; CHECK-BE-NEXT: addi r4, r1, -48 +; CHECK-BE-NEXT: addis r5, r2, .LCPI6_0@toc@ha ; CHECK-BE-NEXT: lxvw4x v2, 0, r4 -; CHECK-BE-NEXT: addi r4, r1, -64 +; CHECK-BE-NEXT: addi r4, r5, .LCPI6_0@toc@l ; CHECK-BE-NEXT: lxvw4x v3, 0, r4 ; CHECK-BE-NEXT: addi r4, r1, -16 -; CHECK-BE-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-NEXT: vperm v2, v2, v2, v3 ; CHECK-BE-NEXT: stxvw4x v2, 0, r4 ; CHECK-BE-NEXT: lwz r4, -16(r1) ; CHECK-BE-NEXT: stw r4, 
0(r3)
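
As a reading aid (not part of the patch): the standalone sketch below mirrors the shuffle-mask construction in LowerTRUNCATEVector for the v8i16 -> v8i8 truncate that test8i8 above exercises. The constants and the main() driver are illustrative assumptions chosen for that one case; in the patch the values come from the SDNode's types. Printing the masks shows which byte lanes are kept: exactly the low byte of every halfword, which is why the updated CHECK lines collapse the whole truncate into a single vpkuhum.

#include <cstdio>
#include <vector>

int main() {
  // Illustrative values for a v8i16 -> v8i8 truncate in a 128-bit register.
  const unsigned SrcSize = 128;                // v8i16 source fills the register.
  const unsigned TrgSize = 64;                 // v8i8 result is 64 bits wide.
  const unsigned TrgNumElts = 8;               // Result element count.
  const unsigned EltBits = 8;                  // Result element width in bits.
  const unsigned WideNumElts = 128 / EltBits;  // 16 byte lanes in the register.
  const unsigned SizeMult = SrcSize / TrgSize; // 2 source bytes per kept byte.

  for (bool IsLittleEndian : {true, false}) {
    std::vector<int> ShuffV;
    if (IsLittleEndian)
      for (unsigned i = 0; i < TrgNumElts; ++i)
        ShuffV.push_back(i * SizeMult);        // LSB is the first byte of each i16.
    else
      for (unsigned i = 1; i <= TrgNumElts; ++i)
        ShuffV.push_back(i * SizeMult - 1);    // LSB is the last byte of each i16.
    for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
      ShuffV.push_back(WideNumElts + 1);       // Don't-care lanes, as in the patch.

    std::printf("%s mask:", IsLittleEndian ? "LE" : "BE");
    for (int Idx : ShuffV)
      std::printf(" %d", Idx);
    std::printf("\n");
  }
  return 0;
}

Running it prints "LE mask: 0 2 4 6 8 10 12 14 ..." and "BE mask: 1 3 5 7 9 11 13 15 ...", i.e. the even (LE) or odd (BE) byte lanes of the widened source, matching the mask the lowering feeds to getVectorShuffle.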