Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h +++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h @@ -67,6 +67,10 @@ /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. VEXTS, + /// SExtVElems, takes an input vector of a smaller type and sign + /// extends to an output vector of a larger type. + SExtVElems, + /// Reciprocal estimate instructions (unary FP ops). FRE, FRSQRTE, Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1168,6 +1168,7 @@ case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; case PPCISD::STXSIX: return "PPCISD::STXSIX"; case PPCISD::VEXTS: return "PPCISD::VEXTS"; + case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; @@ -11311,6 +11312,132 @@ return SDValue(); } +// This function adds the required vector_shuffle needed to get +// the elements of the vector extract in the correct position +// as specified by the CorrectElems encoding. +static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, + SDValue Input, uint64_t Elems, + uint64_t CorrectElems) { + SDLoc dl(N); + + unsigned NumElems = Input.getValueType().getVectorNumElements(); + SmallVector ShuffleMask(NumElems, -1); + + // Knowing the element indices being extracted from the original + // vector and the order in which they're being inserted, just put + // them at element indices required for the instruction. + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (DAG.getDataLayout().isLittleEndian()) + ShuffleMask[CorrectElems & 0xF] = Elems & 0xF; + else + ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4; + CorrectElems = CorrectElems >> 8; + Elems = Elems >> 8; + } + + SDValue Shuffle = + DAG.getVectorShuffle(Input.getValueType(), dl, Input, + DAG.getUNDEF(Input.getValueType()), ShuffleMask); + + EVT Ty = N->getValueType(0); + SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle); + return BV; +} + +// Look for build vector patterns where input operands come from sign +// extended vector_extract elements of specific indices. If the correct indices +// aren't used, add a vector shuffle to fix up the indices and create a new +// PPCISD:SExtVElems node which selects the vector sign extend instructions +// during instruction selection. +static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { + // This array encodes the indices that the vector sign extend instructions + // extract from when extending from one type to another for both BE and LE. + // The right nibble of each byte corresponds to the LE incides. + // and the left nibble of each byte corresponds to the BE incides. + // For example: 0x3074B8FC byte->word + // For LE: the allowed indices are: 0x0,0x4,0x8,0xC + // For BE: the allowed indices are: 0x3,0x7,0xB,0xF + // For example: 0x000070F8 byte->double word + // For LE: the allowed indices are: 0x0,0x8 + // For BE: the allowed indices are: 0x7,0xF + uint64_t TargetElems[] = { + 0x3074B8FC, // b->w + 0x000070F8, // b->d + 0x10325476, // h->w + 0x00003074, // h->d + 0x00001032, // w->d + }; + + uint64_t Elems = 0; + int Index; + SDValue Input; + + auto isSExtOfVecExtract = [&](SDValue Op) -> bool { + if (!Op) + return false; + if (Op.getOpcode() != ISD::SIGN_EXTEND) + return false; + + SDValue Extract = Op.getOperand(0); + if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + ConstantSDNode *ExtOp = dyn_cast(Extract.getOperand(1)); + if (!ExtOp) + return false; + + Index = ExtOp->getZExtValue(); + if (Input && Input != Extract.getOperand(0)) + return false; + + if (!Input) + Input = Extract.getOperand(0); + + Elems = Elems << 8; + Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4; + Elems |= Index; + + return true; + }; + + // If the build vector operands aren't sign extended vector extracts, + // of the same input vector, then return. + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (!isSExtOfVecExtract(N->getOperand(i))) { + return SDValue(); + } + } + + // If the vector extract indicies are not correct, add the appropriate + // vector_shuffle. + int TgtElemArrayIdx; + int InputSize = Input.getValueType().getScalarSizeInBits(); + int OutputSize = N->getValueType(0).getScalarSizeInBits(); + if (InputSize + OutputSize == 40) + TgtElemArrayIdx = 0; + else if (InputSize + OutputSize == 72) + TgtElemArrayIdx = 1; + else if (InputSize + OutputSize == 48) + TgtElemArrayIdx = 2; + else if (InputSize + OutputSize == 80) + TgtElemArrayIdx = 3; + else if (InputSize + OutputSize == 96) + TgtElemArrayIdx = 4; + else + return SDValue(); + + uint64_t CorrectElems = TargetElems[TgtElemArrayIdx]; + CorrectElems = DAG.getDataLayout().isLittleEndian() + ? CorrectElems & 0x0F0F0F0F0F0F0F0F + : CorrectElems & 0xF0F0F0F0F0F0F0F0; + if (Elems != CorrectElems) { + return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems); + } + + // Regular lowering will catch cases where a shuffle is not needed. + return SDValue(); +} + SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::BUILD_VECTOR && @@ -11338,6 +11465,15 @@ if (Reduced) return Reduced; + // If we're building a vector out of extended elements from another vector + // we have P9 vector integer extend instructions. + if (Subtarget.hasP9Altivec()) { + Reduced = combineBVOfVecSExt(N, DAG); + if (Reduced) + return Reduced; + } + + if (N->getValueType(0) != MVT::v2f64) return SDValue(); Index: llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td +++ llvm/trunk/lib/Target/PowerPC/PPCInstrInfo.td @@ -32,6 +32,9 @@ def SDT_PPCVexts : SDTypeProfile<1, 2, [ SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2> ]>; +def SDT_PPCSExtVElems : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisVec<1> +]>; def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -131,6 +134,7 @@ def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix, [SDNPHasChain, SDNPMayStore]>; def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>; +def PPCSExtVElems : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>; // Extract FPSCR (not modeled at the DAG level). def PPCmffs : SDNode<"PPCISD::MFFS", Index: llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td =================================================================== --- llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td +++ llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td @@ -2729,36 +2729,54 @@ } def ByteToWord { - dag A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8)); - dag A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8)); - dag A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8)); - dag A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8)); + dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8)); + dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8)); + dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8)); + dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8)); + dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8)); + dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8)); + dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8)); + dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8)); } def ByteToDWord { - dag A0 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8)); - dag A1 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8)); + dag LE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8)); + dag LE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8)); + dag BE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8)); + dag BE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8)); } def HWordToWord { - dag A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16)); - dag A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16)); - dag A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16)); - dag A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16)); + dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16)); + dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16)); + dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16)); + dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16)); + dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16)); + dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16)); + dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16)); + dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16)); } def HWordToDWord { - dag A0 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16)); - dag A1 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16)); + dag LE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16)); + dag LE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16)); + dag BE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16)); + dag BE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16)); } def WordToDWord { - dag A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0)))); - dag A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2)))); + dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0)))); + dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2)))); + dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1)))); + dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3)))); } def FltToIntLoad { @@ -3016,18 +3034,46 @@ // P9 Altivec instructions that can be used to build vectors. // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete // with complexities of existing build vector patterns in this file. - let Predicates = [HasP9Altivec] in { - def : Pat<(v2i64 (build_vector WordToDWord.A0, WordToDWord.A1)), + let Predicates = [HasP9Altivec, IsLittleEndian] in { + def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)), + (v2i64 (VEXTSW2D $A))>; + def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)), + (v2i64 (VEXTSH2D $A))>; + def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1, + HWordToWord.LE_A2, HWordToWord.LE_A3)), + (v4i32 (VEXTSH2W $A))>; + def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1, + ByteToWord.LE_A2, ByteToWord.LE_A3)), + (v4i32 (VEXTSB2W $A))>; + def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)), + (v2i64 (VEXTSB2D $A))>; + } + + let Predicates = [HasP9Altivec, IsBigEndian] in { + def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)), (v2i64 (VEXTSW2D $A))>; - def : Pat<(v2i64 (build_vector HWordToDWord.A0, HWordToDWord.A1)), + def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)), (v2i64 (VEXTSH2D $A))>; - def : Pat<(v4i32 (build_vector HWordToWord.A0, HWordToWord.A1, - HWordToWord.A2, HWordToWord.A3)), + def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1, + HWordToWord.BE_A2, HWordToWord.BE_A3)), (v4i32 (VEXTSH2W $A))>; - def : Pat<(v4i32 (build_vector ByteToWord.A0, ByteToWord.A1, - ByteToWord.A2, ByteToWord.A3)), + def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1, + ByteToWord.BE_A2, ByteToWord.BE_A3)), (v4i32 (VEXTSB2W $A))>; - def : Pat<(v2i64 (build_vector ByteToDWord.A0, ByteToDWord.A1)), + def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)), (v2i64 (VEXTSB2D $A))>; } + + let Predicates = [HasP9Altivec] in { + def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)), + (v2i64 (VEXTSB2D $A))>; + def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)), + (v2i64 (VEXTSH2D $A))>; + def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)), + (v2i64 (VEXTSW2D $A))>; + def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)), + (v4i32 (VEXTSB2W $A))>; + def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)), + (v4i32 (VEXTSH2W $A))>; + } } Index: llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll =================================================================== --- llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll +++ llvm/trunk/test/CodeGen/PowerPC/vec_int_ext.ll @@ -1,12 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -verify-machineinstrs -mcpu=pwr9 < %s | FileCheck %s -check-prefix=PWR9 -target triple = "powerpc64le-unknown-linux-gnu" +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-gnu-linux -mcpu=pwr9 < %s | FileCheck %s -check-prefix=CHECK-BE + +define <4 x i32> @vextsb2wLE(<16 x i8> %a) { +; CHECK-LE-LABEL: vextsb2wLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsb2w 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsb2wLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsb2w 2, 2 +; CHECK-BE-NEXT: blr -define <4 x i32> @vextsb2w(<16 x i8> %a) { -; PWR9-LABEL: vextsb2w: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsb2w 2, 2 -; PWR9-NEXT: blr entry: %vecext = extractelement <16 x i8> %a, i32 0 %conv = sext i8 %vecext to i32 @@ -23,11 +29,17 @@ ret <4 x i32> %vecinit9 } -define <2 x i64> @vextsb2d(<16 x i8> %a) { -; PWR9-LABEL: vextsb2d: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsb2d 2, 2 -; PWR9-NEXT: blr +define <2 x i64> @vextsb2dLE(<16 x i8> %a) { +; CHECK-LE-LABEL: vextsb2dLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsb2d 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsb2dLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsb2d 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <16 x i8> %a, i32 0 %conv = sext i8 %vecext to i64 @@ -38,11 +50,17 @@ ret <2 x i64> %vecinit3 } -define <4 x i32> @vextsh2w(<8 x i16> %a) { -; PWR9-LABEL: vextsh2w: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsh2w 2, 2 -; PWR9-NEXT: blr +define <4 x i32> @vextsh2wLE(<8 x i16> %a) { +; CHECK-LE-LABEL: vextsh2wLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsh2w 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsh2wLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsh2w 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <8 x i16> %a, i32 0 %conv = sext i16 %vecext to i32 @@ -59,11 +77,17 @@ ret <4 x i32> %vecinit9 } -define <2 x i64> @vextsh2d(<8 x i16> %a) { -; PWR9-LABEL: vextsh2d: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsh2d 2, 2 -; PWR9-NEXT: blr +define <2 x i64> @vextsh2dLE(<8 x i16> %a) { +; CHECK-LE-LABEL: vextsh2dLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsh2d 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsh2dLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vperm 2, 2, 2, 3 +; CHECK-BE-NEXT: vextsh2d 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <8 x i16> %a, i32 0 %conv = sext i16 %vecext to i64 @@ -74,11 +98,17 @@ ret <2 x i64> %vecinit3 } -define <2 x i64> @vextsw2d(<4 x i32> %a) { -; PWR9-LABEL: vextsw2d: -; PWR9: # BB#0: # %entry -; PWR9-NEXT: vextsw2d 2, 2 -; PWR9-NEXT: blr +define <2 x i64> @vextsw2dLE(<4 x i32> %a) { +; CHECK-LE-LABEL: vextsw2dLE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vextsw2d 2, 2 +; CHECK-LE-NEXT: blr +; CHECK-BE-LABEL: vextsw2dLE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE: vmrgew +; CHECK-BE-NEXT: vextsw2d 2, 2 +; CHECK-BE-NEXT: blr + entry: %vecext = extractelement <4 x i32> %a, i32 0 %conv = sext i32 %vecext to i64 @@ -88,3 +118,170 @@ %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 ret <2 x i64> %vecinit3 } + +define <4 x i32> @vextsb2wBE(<16 x i8> %a) { +; CHECK-BE-LABEL: vextsb2wBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsb2w 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsb2wBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 13 +; CHECK-LE-NEXT: vextsb2w 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <16 x i8> %a, i32 3 + %conv = sext i8 %vecext to i32 + %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 7 + %conv2 = sext i8 %vecext1 to i32 + %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 + %vecext4 = extractelement <16 x i8> %a, i32 11 + %conv5 = sext i8 %vecext4 to i32 + %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 + %vecext7 = extractelement <16 x i8> %a, i32 15 + %conv8 = sext i8 %vecext7 to i32 + %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 + ret <4 x i32> %vecinit9 +} + +define <2 x i64> @vextsb2dBE(<16 x i8> %a) { +; CHECK-BE-LABEL: vextsb2dBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsb2d 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsb2dBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 9 +; CHECK-LE-NEXT: vextsb2d 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <16 x i8> %a, i32 7 + %conv = sext i8 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 15 + %conv2 = sext i8 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <4 x i32> @vextsh2wBE(<8 x i16> %a) { +; CHECK-BE-LABEL: vextsh2wBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsh2w 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsh2wBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 14 +; CHECK-LE-NEXT: vextsh2w 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <8 x i16> %a, i32 1 + %conv = sext i16 %vecext to i32 + %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0 + %vecext1 = extractelement <8 x i16> %a, i32 3 + %conv2 = sext i16 %vecext1 to i32 + %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1 + %vecext4 = extractelement <8 x i16> %a, i32 5 + %conv5 = sext i16 %vecext4 to i32 + %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2 + %vecext7 = extractelement <8 x i16> %a, i32 7 + %conv8 = sext i16 %vecext7 to i32 + %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3 + ret <4 x i32> %vecinit9 +} + +define <2 x i64> @vextsh2dBE(<8 x i16> %a) { +; CHECK-BE-LABEL: vextsh2dBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsh2d 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsh2dBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 10 +; CHECK-LE-NEXT: vextsh2d 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <8 x i16> %a, i32 3 + %conv = sext i16 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <8 x i16> %a, i32 7 + %conv2 = sext i16 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <2 x i64> @vextsw2dBE(<4 x i32> %a) { +; CHECK-BE-LABEL: vextsw2dBE: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NEXT: vextsw2d 2, 2 +; CHECK-BE-NEXT: blr +; CHECK-LE-LABEL: vextsw2dBE: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NEXT: vsldoi 2, 2, 2, 12 +; CHECK-LE-NEXT: vextsw2d 2, 2 +; CHECK-LE-NEXT: blr +entry: + %vecext = extractelement <4 x i32> %a, i32 1 + %conv = sext i32 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <4 x i32> %a, i32 3 + %conv2 = sext i32 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <2 x i64> @vextDiffVectors(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LE-LABEL: vextDiffVectors: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NOT: vextsw2d + +; CHECK-BE-LABEL: vextDiffVectors: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NOT: vextsw2d +entry: + %vecext = extractelement <4 x i32> %a, i32 0 + %conv = sext i32 %vecext to i64 + %vecinit = insertelement <2 x i64> undef, i64 %conv, i32 0 + %vecext1 = extractelement <4 x i32> %b, i32 2 + %conv2 = sext i32 %vecext1 to i64 + %vecinit3 = insertelement <2 x i64> %vecinit, i64 %conv2, i32 1 + ret <2 x i64> %vecinit3 +} + +define <8 x i16> @testInvalidExtend(<16 x i8> %a) { +entry: +; CHECK-LE-LABEL: testInvalidExtend: +; CHECK-LE: # BB#0: # %entry +; CHECK-LE-NOT: vexts + +; CHECK-BE-LABEL: testInvalidExtend: +; CHECK-BE: # BB#0: # %entry +; CHECK-BE-NOT: vexts + + %vecext = extractelement <16 x i8> %a, i32 0 + %conv = sext i8 %vecext to i16 + %vecinit = insertelement <8 x i16> undef, i16 %conv, i32 0 + %vecext1 = extractelement <16 x i8> %a, i32 2 + %conv2 = sext i8 %vecext1 to i16 + %vecinit3 = insertelement <8 x i16> %vecinit, i16 %conv2, i32 1 + %vecext4 = extractelement <16 x i8> %a, i32 4 + %conv5 = sext i8 %vecext4 to i16 + %vecinit6 = insertelement <8 x i16> %vecinit3, i16 %conv5, i32 2 + %vecext7 = extractelement <16 x i8> %a, i32 6 + %conv8 = sext i8 %vecext7 to i16 + %vecinit9 = insertelement <8 x i16> %vecinit6, i16 %conv8, i32 3 + %vecext10 = extractelement <16 x i8> %a, i32 8 + %conv11 = sext i8 %vecext10 to i16 + %vecinit12 = insertelement <8 x i16> %vecinit9, i16 %conv11, i32 4 + %vecext13 = extractelement <16 x i8> %a, i32 10 + %conv14 = sext i8 %vecext13 to i16 + %vecinit15 = insertelement <8 x i16> %vecinit12, i16 %conv14, i32 5 + %vecext16 = extractelement <16 x i8> %a, i32 12 + %conv17 = sext i8 %vecext16 to i16 + %vecinit18 = insertelement <8 x i16> %vecinit15, i16 %conv17, i32 6 + %vecext19 = extractelement <16 x i8> %a, i32 14 + %conv20 = sext i8 %vecext19 to i16 + %vecinit21 = insertelement <8 x i16> %vecinit18, i16 %conv20, i32 7 + ret <8 x i16> %vecinit21 +}