Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -82,15 +82,15 @@
     ///
     XXSPLT,
 
-    /// XXINSERT - The PPC VSX insert instruction
+    /// VECINSERT - The PPC vector insert instruction
     ///
-    XXINSERT,
+    VECINSERT,
 
     /// XXREVERSE - The PPC VSX reverse instruction
     ///
     XXREVERSE,
 
-    /// VECSHL - The PPC VSX shift left instruction
+    /// VECSHL - The PPC vector shift left instruction
     ///
     VECSHL,
 
@@ -504,6 +504,16 @@
     bool isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
                          unsigned &InsertAtByte, bool &Swap, bool IsLE);
 
+    /// isVINSERTHMask - Return true if this VECTOR_SHUFFLE can be handled by
+    /// the VINSERTH instruction introduced in ISA 3.0, i.e. any shuffle of
+    /// v8i16 vectors that just inserts one element from one vector into the
+    /// other. The output parameters describe how far the source vector needs
+    /// to be shifted and which byte number the instruction must be given so
+    /// that the element lands in the desired location of the target vector.
+    bool isVINSERTHMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+                        unsigned &InsertAtByte, bool &Swap, bool IsLE);
+
     /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
     /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
     unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize, SelectionDAG &DAG);
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -114,6 +114,8 @@
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
 
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
@@ -1127,7 +1129,7 @@
   case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
   case PPCISD::VPERM:           return "PPCISD::VPERM";
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
-  case PPCISD::XXINSERT:        return "PPCISD::XXINSERT";
+  case PPCISD::VECINSERT:       return "PPCISD::VECINSERT";
   case PPCISD::XXREVERSE:       return "PPCISD::XXREVERSE";
   case PPCISD::XXPERMDI:        return "PPCISD::XXPERMDI";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
@@ -1611,6 +1613,88 @@
   return true;
 }
 
+bool PPC::isVINSERTHMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+                         unsigned &InsertAtByte, bool &Swap, bool IsLE) {
+  const unsigned NumHalfWords = 8;
+  const unsigned BytesInVector = NumHalfWords * 2;
+  // Check that the shuffle is on half-words.
+  if (!isNByteElemShuffleMask(N, 2, 1))
+    return false;
+
+  // Shifts required to get the half-word we want at element 3.
+  unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
+  unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};
+
+  uint32_t Mask = 0;
+  uint32_t OriginalOrderLow = 0x1234567;
+  uint32_t OriginalOrderHigh = 0x89ABCDEF;
+  // Look at the even byte indices of the mask (0,2,4,...,14); dividing each
+  // by 2 gives the half-word index. Pack those into a 32-bit word, one 4-bit
+  // nibble per element.
+  for (unsigned i = 0; i < NumHalfWords; ++i) {
+    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+    Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
+  }
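+
+  // For example, the little-endian half-word shuffle <0,1,2,3,4,5,11,7>
+  // (covered by shuffle_vector_halfword_6_11 in the tests below) packs to
+  // Mask == 0x012345B7; the search that follows then matches at i == 6 and
+  // produces ShiftElts == 1, InsertAtByte == 2 and Swap == false.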
+
+  // For each mask element, find out if we're just inserting something
+  // from V2 into V1 or vice versa.
+  // Possible permutations inserting an element from V2 into V1:
+  //   X, 1, 2, 3, 4, 5, 6, 7
+  //   0, X, 2, 3, 4, 5, 6, 7
+  //   0, 1, X, 3, 4, 5, 6, 7
+  //   0, 1, 2, X, 4, 5, 6, 7
+  //   0, 1, 2, 3, X, 5, 6, 7
+  //   0, 1, 2, 3, 4, X, 6, 7
+  //   0, 1, 2, 3, 4, 5, X, 7
+  //   0, 1, 2, 3, 4, 5, 6, X
+  // Inserting from V1 into V2 is similar, except the mask range is [8,15].
+
+  // Go through the mask of half-words to find an element that's being moved
+  // from one vector to the other.
+  for (unsigned i = 0; i < NumHalfWords; ++i) {
+    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+    uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
+    uint32_t MaskOtherElts = ~(0xF << MaskShift);
+    // The target order is [8,15] if the current mask element is in [0,7].
+    uint32_t TargetOrder =
+        (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
+    // This is the inserted element only if the mask of the other elements
+    // matches the expected order exactly.
+    if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+      // We only need the last 3 bits for the number of shifts.
+      ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
+                       : BigEndianShifts[MaskOneElt & 0x7];
+      InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+      Swap = MaskOneElt < NumHalfWords;
+      return true;
+    }
+  }
+
+  // If both vector operands for the shuffle are the same vector, the mask
+  // will contain only elements from the first one and the second one will be
+  // undef.
+  if (N->getOperand(1).isUndef()) {
+    ShiftElts = 0;
+    Swap = true;
+    unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
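+
+    // For example, the little-endian single-vector shuffle <4,1,2,3,4,5,6,7>
+    // (covered by shuffle_vector_halfword_0_4 in the tests below) packs to
+    // Mask == 0x41234567; the search that follows then matches at i == 0 and
+    // produces InsertAtByte == 14.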
+
+    // Go through the mask of half-words to find the element that is being
+    // inserted (VINSERTH reads its source from half-word 4 in LE order, 3 in
+    // BE order).
+    for (unsigned i = 0; i < NumHalfWords; ++i) {
+      // The shift is multiplied by 4 since each mask element is a nibble.
+      unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
+      uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
+      uint32_t MaskOtherElts = ~(0xF << MaskShift);
+      uint32_t TargetOrder = OriginalOrderLow;
+      // Skip if this is not the correct element or the mask of the other
+      // elements doesn't match the expected order exactly.
+      if (MaskOneElt == VINSERTHSrcElem &&
+          (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
+        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 /// Check that the mask is shuffling N byte elements. Within each N byte
 /// element of the mask, the indices could be either in increasing or
 /// decreasing order as long as they are consecutive.
@@ -7865,15 +7949,37 @@
     if (ShiftElts) {
       SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
                                 DAG.getConstant(ShiftElts, dl, MVT::i32));
-      SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Shl,
+      SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
                                 DAG.getConstant(InsertAtByte, dl, MVT::i32));
       return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
     }
-    SDValue Ins = DAG.getNode(PPCISD::XXINSERT, dl, MVT::v4i32, Conv1, Conv2,
+    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
                               DAG.getConstant(InsertAtByte, dl, MVT::i32));
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
   }
 
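+  // On Power9, lower a single-element v8i16 insert to VINSERTH: shift the
+  // source half-word into the position VINSERTH reads from, then insert it
+  // at the computed byte offset of the target. For shuffle_vector_halfword_0_8
+  // (LE) below, this emits vsldoi 3, 3, 3, 8 followed by vinserth 2, 3, 14.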
+  if (Subtarget.hasP9Altivec()) {
+    if (PPC::isVINSERTHMask(SVOp, ShiftElts, InsertAtByte, Swap,
+                            isLittleEndian)) {
+      if (Swap)
+        std::swap(V1, V2);
+      SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+      if (ShiftElts) {
+        // Double ShiftElts because we're left-shifting on the v16i8 type.
+        SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+                                  DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
+        SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
+        SDValue Ins =
+            DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+                        DAG.getConstant(InsertAtByte, dl, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+      }
+      SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+      SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
+                                DAG.getConstant(InsertAtByte, dl, MVT::i32));
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
+    }
+  }
 
   if (Subtarget.hasVSX() &&
       PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
Index: lib/Target/PowerPC/PPCInstrAltivec.td
===================================================================
--- lib/Target/PowerPC/PPCInstrAltivec.td
+++ lib/Target/PowerPC/PPCInstrAltivec.td
@@ -477,10 +477,10 @@
 def VSEL : VA1a_Int_Ty<42, "vsel", int_ppc_altivec_vsel, v4i32>;
 
 // Shuffles.
-def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u5imm:$SH),
+def VSLDOI : VAForm_2<44, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB, u4imm:$SH),
                       "vsldoi $vD, $vA, $vB, $SH", IIC_VecFP,
-                      [(set v16i8:$vD,
-                        (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
+                      [(set v16i8:$vD,
+                        (PPCvecshl v16i8:$vA, v16i8:$vB, imm32SExt16:$SH))]>;
 
 // VX-Form instructions. AltiVec arithmetic ops.
 let isCommutable = 1 in {
@@ -908,6 +908,9 @@
           (VPKUWUM $vA, $vA)>;
 def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
           (VPKUHUM $vA, $vA)>;
+def:Pat<(vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB),
+          (VSLDOI v16i8:$vA, v16i8:$vB, (VSLDOI_get_imm $SH))>;
+
 // Match vsldoi(y,x), vpkuwum(y,x), vpkuhum(y,x), i.e., swapped operands.
 // These fragments are matched for little-endian, where the inputs must
@@ -1310,7 +1313,12 @@
 
 // Vector Insert Element Instructions
 def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
-def VINSERTH : VX1_VT5_UIM5_VB5<845, "vinserth", []>;
+def VINSERTH : VXForm_1<845, (outs vrrc:$vD),
+                        (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
+                        "vinserth $vD, $vB, $UIM", IIC_VecGeneral,
+                        [(set v8i16:$vD, (PPCvecinsert v8i16:$vDi, v8i16:$vB,
+                                          imm32SExt16:$UIM))]>,
+               RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
 def VINSERTW : VX1_VT5_UIM5_VB5<909, "vinsertw", []>;
 def VINSERTD : VX1_VT5_UIM5_VB5<973, "vinsertd", []>;
Index: lib/Target/PowerPC/PPCInstrInfo.td
===================================================================
--- lib/Target/PowerPC/PPCInstrInfo.td
+++ lib/Target/PowerPC/PPCInstrInfo.td
@@ -177,7 +177,7 @@
 def PPCvperm     : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
 def PPCxxsplt    : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
-def PPCxxinsert  : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
 def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>;
 def PPCxxpermdi  : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
 def PPCvecshl    : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
Index: lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- lib/Target/PowerPC/PPCInstrVSX.td
+++ lib/Target/PowerPC/PPCInstrVSX.td
@@ -2252,7 +2252,7 @@
     XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
                      (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
                      "xxinsertw $XT, $XB, $UIM", IIC_VecFP,
-                     [(set v4i32:$XT, (PPCxxinsert v4i32:$XTi, v4i32:$XB,
+                     [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB,
                                        imm32SExt16:$UIM))]>,
                      RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
Index: test/CodeGen/PowerPC/p9-vinsert-vextract.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/p9-vinsert-vextract.ll
@@ -0,0 +1,289 @@
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BE
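+
+; The tests below follow the naming scheme shuffle_vector_halfword_X_Y: the
+; shuffle writes half-word Y of the concatenated pair (%a, %b) into position X
+; of the result. Cases the new lowering handles should become a single
+; vinserth (plus a vsldoi when the source half-word must be shifted into
+; place); the CHECK-NOT cases verify endian-specific patterns that do not map
+; to vinserth.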
+
+define <8 x i16> @_Z27shuffle_vector_halfword_0_8Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_0_8Dv8_sS_
+; CHECK: vsldoi 3, 3, 3, 8
+; CHECK: vinserth 2, 3, 14
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_0_8Dv8_sS_
+; CHECK-BE: vsldoi 3, 3, 3, 10
+; CHECK-BE: vinserth 2, 3, 0
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_1_15Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_1_15Dv8_sS_
+; CHECK: vsldoi 3, 3, 3, 10
+; CHECK: vinserth 2, 3, 12
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_1_15Dv8_sS_
+; CHECK-BE: vsldoi 3, 3, 3, 8
+; CHECK-BE: vinserth 2, 3, 2
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 15, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_2_9Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_2_9Dv8_sS_
+; CHECK: vsldoi 3, 3, 3, 6
+; CHECK: vinserth 2, 3, 10
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_2_9Dv8_sS_
+; CHECK-BE: vsldoi 3, 3, 3, 12
+; CHECK-BE: vinserth 2, 3, 4
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_3_13Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_3_13Dv8_sS_
+; CHECK: vsldoi 3, 3, 3, 14
+; CHECK: vinserth 2, 3, 8
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_3_13Dv8_sS_
+; CHECK-BE: vsldoi 3, 3, 3, 4
+; CHECK-BE: vinserth 2, 3, 6
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_4_10Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_4_10Dv8_sS_
+; CHECK: vsldoi 3, 3, 3, 4
+; CHECK: vinserth 2, 3, 6
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_4_10Dv8_sS_
+; CHECK-BE: vsldoi 3, 3, 3, 14
+; CHECK-BE: vinserth 2, 3, 8
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 10, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_5_14Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_5_14Dv8_sS_
+; CHECK: vsldoi 3, 3, 3, 12
+; CHECK: vinserth 2, 3, 4
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_5_14Dv8_sS_
+; CHECK-BE: vsldoi 3, 3, 3, 6
+; CHECK-BE: vinserth 2, 3, 10
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 14, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_6_11Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_6_11Dv8_sS_
+; CHECK: vsldoi 3, 3, 3, 2
+; CHECK: vinserth 2, 3, 2
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_6_11Dv8_sS_
+; CHECK-BE: vinserth 2, 3, 12
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 11, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_7_12Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_7_12Dv8_sS_
+; CHECK: vinserth 2, 3, 0
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_7_12Dv8_sS_
+; CHECK-BE: vsldoi 3, 3, 3, 2
+; CHECK-BE: vinserth 2, 3, 14
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 12>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_8_1Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_8_1Dv8_sS_
+; CHECK: vsldoi 2, 2, 2, 6
+; CHECK: vinserth 3, 2, 14
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_8_1Dv8_sS_
+; CHECK-BE: vsldoi 2, 2, 2, 12
+; CHECK-BE: vinserth 3, 2, 0
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_9_7Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_9_7Dv8_sS_
+; CHECK: vsldoi 2, 2, 2, 10
+; CHECK: vinserth 3, 2, 12
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_9_7Dv8_sS_
+; CHECK-BE: vsldoi 2, 2, 2, 8
+; CHECK-BE: vinserth 3, 2, 2
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_10_4Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_10_4Dv8_sS_
+; CHECK: vinserth 3, 2, 10
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_10_4Dv8_sS_
+; CHECK-BE: vsldoi 2, 2, 2, 2
+; CHECK-BE: vinserth 3, 2, 4
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 4, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_11_2Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_11_2Dv8_sS_
+; CHECK: vsldoi 2, 2, 2, 4
+; CHECK: vinserth 3, 2, 8
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_11_2Dv8_sS_
+; CHECK-BE: vsldoi 2, 2, 2, 14
+; CHECK-BE: vinserth 3, 2, 6
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 2, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_12_6Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_12_6Dv8_sS_
+; CHECK: vsldoi 2, 2, 2, 12
+; CHECK: vinserth 3, 2, 6
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_12_6Dv8_sS_
+; CHECK-BE: vsldoi 2, 2, 2, 6
+; CHECK-BE: vinserth 3, 2, 8
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 6, i32 13, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_13_3Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_13_3Dv8_sS_
+; CHECK: vsldoi 2, 2, 2, 2
+; CHECK: vinserth 3, 2, 4
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_13_3Dv8_sS_
+; CHECK-BE: vinserth 3, 2, 10
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 3, i32 14, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_14_5Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_14_5Dv8_sS_
+; CHECK: vsldoi 2, 2, 2, 14
+; CHECK: vinserth 3, 2, 2
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_14_5Dv8_sS_
+; CHECK-BE: vsldoi 2, 2, 2, 4
+; CHECK-BE: vinserth 3, 2, 12
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 5, i32 15>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z28shuffle_vector_halfword_15_0Dv8_sS_(<8 x i16> %a, <8 x i16> %b) {
+entry:
+; CHECK-LABEL: _Z28shuffle_vector_halfword_15_0Dv8_sS_
+; CHECK: vsldoi 2, 2, 2, 8
+; CHECK: vinserth 3, 2, 0
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: _Z28shuffle_vector_halfword_15_0Dv8_sS_
+; CHECK-BE: vsldoi 2, 2, 2, 10
+; CHECK-BE: vinserth 3, 2, 14
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_0_4Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_0_4Dv8_s
+; CHECK: vinserth 2, 2, 14
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_0_4Dv8_s
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_1_3Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_1_3Dv8_s
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_1_3Dv8_s
+; CHECK-BE: vinserth 2, 2, 2
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 3, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_2_3Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_2_3Dv8_s
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_2_3Dv8_s
+; CHECK-BE: vinserth 2, 2, 4
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_3_4Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_3_4Dv8_s
+; CHECK: vinserth 2, 2, 8
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_3_4Dv8_s
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 4, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_4_3Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_4_3Dv8_s
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_4_3Dv8_s
+; CHECK-BE: vinserth 2, 2, 8
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 5, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_5_3Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_5_3Dv8_s
+; CHECK-NOT: vinserth
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_5_3Dv8_s
+; CHECK-BE: vinserth 2, 2, 10
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 3, i32 6, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_6_4Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_6_4Dv8_s
+; CHECK: vinserth 2, 2, 2
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_6_4Dv8_s
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 4, i32 7>
+  ret <8 x i16> %vecins
+}
+
+define <8 x i16> @_Z27shuffle_vector_halfword_7_4Dv8_s(<8 x i16> %a) {
+entry:
+; CHECK-LABEL: _Z27shuffle_vector_halfword_7_4Dv8_s
+; CHECK: vinserth 2, 2, 0
+; CHECK-BE-LABEL: _Z27shuffle_vector_halfword_7_4Dv8_s
+; CHECK-BE-NOT: vinserth
+  %vecins = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>
+  ret <8 x i16> %vecins
+}