Index: lib/Target/Mips/MipsISelDAGToDAG.h =================================================================== --- lib/Target/Mips/MipsISelDAGToDAG.h +++ lib/Target/Mips/MipsISelDAGToDAG.h @@ -84,7 +84,8 @@ SDValue &Offset, SDValue &Alias); /// \brief Select constant vector splats. - virtual bool selectVSplat(SDNode *N, APInt &Imm) const; + virtual bool selectVSplat(SDNode *N, APInt &Imm, + unsigned MinSizeInBits) const; /// \brief Select constant vector splats whose value fits in a uimm1. virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const; /// \brief Select constant vector splats whose value fits in a uimm2. Index: lib/Target/Mips/MipsISelDAGToDAG.cpp =================================================================== --- lib/Target/Mips/MipsISelDAGToDAG.cpp +++ lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -113,7 +113,8 @@ return false; } -bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const { +bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm, + unsigned MinSizeInBits) const { llvm_unreachable("Unimplemented function."); return false; } Index: lib/Target/Mips/MipsMSAInstrInfo.td =================================================================== --- lib/Target/Mips/MipsMSAInstrInfo.td +++ lib/Target/Mips/MipsMSAInstrInfo.td @@ -375,7 +375,7 @@ APInt Imm; EVT EltTy = N->getValueType(0).getVectorElementType(); - return selectVSplat (N, Imm) && + return selectVSplat(N, Imm, EltTy.getSizeInBits()) && Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1; }]>; @@ -384,7 +384,7 @@ SDNode *BV = N->getOperand(0).getNode(); EVT EltTy = N->getValueType(0).getVectorElementType(); - return selectVSplat (BV, Imm) && + return selectVSplat(BV, Imm, EltTy.getSizeInBits()) && Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1; }]>; Index: lib/Target/Mips/MipsSEISelDAGToDAG.h =================================================================== --- lib/Target/Mips/MipsSEISelDAGToDAG.h +++ lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -78,7 +78,8 @@ SDValue &Offset) const override; /// \brief Select constant vector splats. - bool selectVSplat(SDNode *N, APInt &Imm) const override; + bool selectVSplat(SDNode *N, APInt &Imm, + unsigned MinSizeInBits) const override; /// \brief Select constant vector splats whose value fits in a given integer. 
bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed, unsigned ImmBitSize) const; Index: lib/Target/Mips/MipsSEISelDAGToDAG.cpp =================================================================== --- lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -471,7 +471,8 @@ // Returns true and sets Imm if: // * MSA is enabled // * N is a ISD::BUILD_VECTOR representing a constant splat -bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const { +bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm, + unsigned MinSizeInBits) const { if (!Subtarget->hasMSA()) return false; @@ -484,9 +485,8 @@ unsigned SplatBitSize; bool HasAnyUndefs; - if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs, 8, - !Subtarget->isLittle())) + if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, + MinSizeInBits, !Subtarget->isLittle())) return false; Imm = SplatValue; @@ -519,8 +519,9 @@ if (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0); - if (selectVSplat (N.getNode(), ImmValue) && + if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ImmValue.getBitWidth() == EltTy.getSizeInBits()) { + if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) || (!Signed && ImmValue.isIntN(ImmBitSize))) { Imm = CurDAG->getTargetConstant(ImmValue, EltTy); @@ -594,7 +595,7 @@ if (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0); - if (selectVSplat (N.getNode(), ImmValue) && + if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ImmValue.getBitWidth() == EltTy.getSizeInBits()) { int32_t Log2 = ImmValue.exactLogBase2(); @@ -625,7 +626,7 @@ if (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0); - if (selectVSplat(N.getNode(), ImmValue) && + if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ImmValue.getBitWidth() == EltTy.getSizeInBits()) { // Extract the run of set bits starting with bit zero from the bitwise // inverse of ImmValue, and test that the inverse of this is the same @@ -658,7 +659,7 @@ if (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0); - if (selectVSplat(N.getNode(), ImmValue) && + if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ImmValue.getBitWidth() == EltTy.getSizeInBits()) { // Extract the run of set bits starting with bit zero, and test that the // result is the same as the original value @@ -679,7 +680,7 @@ if (N->getOpcode() == ISD::BITCAST) N = N->getOperand(0); - if (selectVSplat(N.getNode(), ImmValue) && + if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) && ImmValue.getBitWidth() == EltTy.getSizeInBits()) { int32_t Log2 = (~ImmValue).exactLogBase2(); Index: lib/Target/Mips/MipsSEISelLowering.cpp =================================================================== --- lib/Target/Mips/MipsSEISelLowering.cpp +++ lib/Target/Mips/MipsSEISelLowering.cpp @@ -2407,7 +2407,7 @@ // It is therefore possible to lower into SHF when the mask takes the form: // // When undef's appear they are treated as if they were whatever value is -// necessary in order to fit the above form. +// necessary in order to fit the above forms. // // For example: // %2 = shufflevector <8 x i16> %0, <8 x i16> undef, @@ -2465,177 +2465,326 @@ DAG.getConstant(Imm, MVT::i32), Op->getOperand(0)); } +/// Determine whether a range fits a regular pattern of values. +/// This function accounts for the possibility of jumping over the End iterator. 
+template <typename ValType>
+static bool
+fitsRegularPattern(typename SmallVectorImpl<ValType>::const_iterator Begin,
+                   unsigned CheckStride,
+                   typename SmallVectorImpl<ValType>::const_iterator End,
+                   ValType ExpectedIndex, unsigned ExpectedIndexStride) {
+  auto &I = Begin;
+
+  while (I != End) {
+    if (*I != -1 && *I != ExpectedIndex)
+      return false;
+    ExpectedIndex += ExpectedIndexStride;
+
+    // Incrementing past End is undefined behaviour so we must increment one
+    // step at a time and check for End at each step.
+    for (unsigned n = 0; n < CheckStride && I != End; ++n, ++I)
+      ; // Empty loop body.
+  }
+  return true;
+}
+
+// Determine whether VECTOR_SHUFFLE is a SPLATI.
+//
+// It is a SPLATI when the mask is:
+//   <x, x, x, ...>
+// where x is any valid index.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static bool isVECTOR_SHUFFLE_SPLATI(SDValue Op, EVT ResTy,
+                                    SmallVector<int, 16> Indices,
+                                    SelectionDAG &DAG) {
+  assert((Indices.size() % 2) == 0);
+
+  int SplatIndex = -1;
+  for (const auto &V : Indices) {
+    if (V != -1) {
+      SplatIndex = V;
+      break;
+    }
+  }
+
+  return fitsRegularPattern<int>(Indices.begin(), 1, Indices.end(), SplatIndex,
+                                 0);
+}
+
 // Lower VECTOR_SHUFFLE into ILVEV (if possible).
 //
 // ILVEV interleaves the even elements from each vector.
 //
-// It is possible to lower into ILVEV when the mask takes the form:
-//   <0, n, 2, n+2, 4, n+4, ...>
+// It is possible to lower into ILVEV when the mask consists of two of the
+// following forms interleaved:
+//   <0, 2, 4, ...>
+//   <n, n+2, n+4, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <0, 0, 2, 2, 4, 4, ...>
+//   <0, n, 2, n+2, 4, n+4, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int WsIdx = 0;
-  int WtIdx = ResTy.getVectorNumElements();
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the even elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size(), 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx += 2;
-    WtIdx += 2;
-  }
+  // Check odd elements are taken from the even elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size(), 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Ws, Wt);
 }
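// A rough standalone sketch (not the patch's code) of the pattern check that
// fitsRegularPattern performs, re-implemented over std::vector<int> so it can
// be compiled and run on its own. The name fitsPattern and the v4i32 mask
// <0, 4, 2, 6> (n = 4) are invented for this illustration: the even mask
// positions walk 0, 2 (even elements of operand 0) and the odd positions walk
// 4, 6 (even elements of operand 1), with -1 acting as a wildcard.
#include <cstdio>
#include <vector>

static bool fitsPattern(const std::vector<int> &Mask, unsigned Start,
                        unsigned CheckStride, int Expected,
                        unsigned ExpectedStride) {
  for (unsigned i = Start; i < Mask.size(); i += CheckStride) {
    if (Mask[i] != -1 && Mask[i] != Expected)
      return false;
    Expected += ExpectedStride;
  }
  return true;
}

int main() {
  std::vector<int> Mask = {0, 4, 2, 6}; // shufflevector mask, -1 means undef
  unsigned N = 4;                       // elements per input vector
  // Even positions drawn from the even elements of operand 0?
  std::printf("evens from op0: %d\n", fitsPattern(Mask, 0, 2, 0, 2));
  // Odd positions drawn from the even elements of operand 1?
  std::printf("odds from op1:  %d\n", fitsPattern(Mask, 1, 2, N, 2));
  return 0;
}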
 
 // Lower VECTOR_SHUFFLE into ILVOD (if possible).
 //
 // ILVOD interleaves the odd elements from each vector.
 //
-// It is possible to lower into ILVOD when the mask takes the form:
-//   <1, n+1, 3, n+3, 5, n+5, ...>
+// It is possible to lower into ILVOD when the mask consists of two of the
+// following forms interleaved:
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <1, 1, 3, 3, 5, 5, ...>
+//   <1, n+1, 3, n+3, 5, n+5, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int WsIdx = 1;
-  int WtIdx = ResTy.getVectorNumElements() + 1;
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the odd elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 1, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size() + 1, 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx += 2;
-    WtIdx += 2;
-  }
+  // Check odd elements are taken from the odd elements of one half or the
+  // other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 1, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size() + 1, 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Wt, Ws);
 }
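// A small standalone illustration (not part of the patch) of how undef mask
// entries behave in the checks above: -1 never mismatches, so a v4i32 mask
// such as <1, 1, -1, 3> still selects operand 0 for both the even and the odd
// mask positions, which is the single-operand ilvod case exercised by the new
// tests below. The helper name matchesOddElementsOf is invented for this
// sketch.
#include <cassert>
#include <vector>

static bool matchesOddElementsOf(const std::vector<int> &Mask, unsigned Start,
                                 int FirstOddIndex) {
  int Expected = FirstOddIndex; // 1 for operand 0, n+1 for operand 1
  for (unsigned i = Start; i < Mask.size(); i += 2) {
    if (Mask[i] != -1 && Mask[i] != Expected)
      return false;
    Expected += 2;
  }
  return true;
}

int main() {
  std::vector<int> Mask = {1, 1, -1, 3}; // -1 means undef
  assert(matchesOddElementsOf(Mask, 0, 1));  // even positions -> operand 0
  assert(matchesOddElementsOf(Mask, 1, 1));  // odd positions  -> operand 0
  assert(!matchesOddElementsOf(Mask, 1, 5)); // not operand 1 (n == 4)
  return 0;
}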
 
-// Lower VECTOR_SHUFFLE into ILVL (if possible).
+// Lower VECTOR_SHUFFLE into ILVR (if possible).
 //
-// ILVL interleaves consecutive elements from the left half of each vector.
+// ILVR interleaves consecutive elements from the right (lowest-indexed) half of
+// each vector.
 //
-// It is possible to lower into ILVL when the mask takes the form:
-//   <0, n, 1, n+1, 2, n+2, ...>
+// It is possible to lower into ILVR when the mask consists of two of the
+// following forms interleaved:
+//   <0, 1, 2, ...>
+//   <n, n+1, n+2, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <0, 0, 1, 1, 2, 2, ...>
+//   <0, n, 1, n+1, 2, n+2, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
                                         SmallVector<int, 16> Indices,
                                         SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int WsIdx = 0;
-  int WtIdx = ResTy.getVectorNumElements();
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the right (lowest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, 0, 1))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size(), 1))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx ++;
-    WtIdx ++;
-  }
+  // Check odd elements are taken from the right (lowest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, 0, 1))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size(), 1))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Ws, Wt);
 }
 
-// Lower VECTOR_SHUFFLE into ILVR (if possible).
+// Lower VECTOR_SHUFFLE into ILVL (if possible).
 //
-// ILVR interleaves consecutive elements from the right half of each vector.
+// ILVL interleaves consecutive elements from the left (highest-indexed) half
+// of each vector.
 //
-// It is possible to lower into ILVR when the mask takes the form:
-//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+// It is possible to lower into ILVL when the mask consists of two of the
+// following forms interleaved:
+//   <x, x+1, x+2, ...>
+//   <n+x, n+x+1, n+x+2, ...>
 // where n is the number of elements in the vector and x is half n.
+// For example:
+//   <x, x, x+1, x+1, x+2, x+2, ...>
+//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
-static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+// value is necessary in order to fit the above forms.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
                                         SmallVector<int, 16> Indices,
                                         SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  unsigned NumElts = ResTy.getVectorNumElements();
-  int WsIdx = NumElts / 2;
-  int WtIdx = NumElts + NumElts / 2;
+  assert((Indices.size() % 2) == 0);
+
+  unsigned HalfSize = Indices.size() / 2;
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &End = Indices.end();
+
+  // Check even elements are taken from the left (highest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin, 2, End, HalfSize, 1))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 2, End, Indices.size() + HalfSize, 1))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  for (unsigned i = 0; i < Indices.size(); i += 2) {
-    if (Indices[i] != -1 && Indices[i] != WsIdx)
-      return SDValue();
-    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
-      return SDValue();
-    WsIdx ++;
-    WtIdx ++;
-  }
+  // Check odd elements are taken from the left (highest-indexed) elements of
+  // one half or the other and pick an operand accordingly.
+  if (fitsRegularPattern<int>(Begin + 1, 2, End, HalfSize, 1))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin + 1, 2, End, Indices.size() + HalfSize,
+                                   1))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Ws, Wt);
 }
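// A runnable, self-contained sketch (not the patch's code) of the two-way
// operand choice the ILVR/ILVL lowerings above share: a stride-2 walk of the
// mask either matches indices into the first input (values starting below n)
// or into the second (values starting at n or above). The helper name
// pickOperand and the v8i16 mask are invented for this illustration.
#include <cstdio>
#include <vector>

// Returns 0 or 1 for the operand a sequence of mask values selects, or -1 if
// it selects neither.
static int pickOperand(const std::vector<int> &Seq, int Start, int N) {
  bool FitsLo = true, FitsHi = true;
  int ExpLo = Start, ExpHi = Start + N;
  for (int V : Seq) {
    if (V != -1 && V != ExpLo)
      FitsLo = false;
    if (V != -1 && V != ExpHi)
      FitsHi = false;
    ++ExpLo;
    ++ExpHi;
  }
  return FitsLo ? 0 : FitsHi ? 1 : -1;
}

int main() {
  // v8i16 mask <4, 4, 5, 5, 6, 6, 7, 7>: both the even and the odd mask
  // positions walk the high (left) half of operand 0, so the lowering can use
  // the same register for both inputs, as the single-operand ilvl.h CHECK
  // lines below (with a repeated register) expect.
  std::vector<int> EvenPositions = {4, 5, 6, 7};
  std::vector<int> OddPositions = {4, 5, 6, 7};
  std::printf("even positions -> operand %d\n", pickOperand(EvenPositions, 4, 8));
  std::printf("odd positions  -> operand %d\n", pickOperand(OddPositions, 4, 8));
  return 0;
}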
 
 // Lower VECTOR_SHUFFLE into PCKEV (if possible).
 //
 // PCKEV copies the even elements of each vector into the result vector.
 //
-// It is possible to lower into PCKEV when the mask takes the form:
-//   <0, 2, 4, ..., n, n+2, n+4, ...>
+// It is possible to lower into PCKEV when the mask consists of two of the
+// following forms concatenated:
+//   <0, 2, 4, ...>
+//   <n, n+2, n+4, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <0, 2, 4, ..., 0, 2, 4, ...>
+//   <0, 2, 4, ..., n, n+2, n+4, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int Idx = 0;
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &Mid = Indices.begin() + Indices.size() / 2;
+  const auto &End = Indices.end();
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 0, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Indices.size(), 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  for (unsigned i = 0; i < Indices.size(); ++i) {
-    if (Indices[i] != -1 && Indices[i] != Idx)
-      return SDValue();
-    Idx += 2;
-  }
+  if (fitsRegularPattern<int>(Mid, 1, End, 0, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Mid, 1, End, Indices.size(), 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Ws, Wt);
 }
 
 // Lower VECTOR_SHUFFLE into PCKOD (if possible).
 //
 // PCKOD copies the odd elements of each vector into the result vector.
 //
-// It is possible to lower into PCKOD when the mask takes the form:
-//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+// It is possible to lower into PCKOD when the mask consists of two of the
+// following forms concatenated:
+//   <1, 3, 5, ...>
+//   <n+1, n+3, n+5, ...>
 // where n is the number of elements in the vector.
+// For example:
+//   <1, 3, 5, ..., 1, 3, 5, ...>
+//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
 //
 // When undef's appear in the mask they are treated as if they were whatever
-// value is necessary in order to fit the above form.
+// value is necessary in order to fit the above forms.
 static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
                                          SmallVector<int, 16> Indices,
                                          SelectionDAG &DAG) {
-  assert ((Indices.size() % 2) == 0);
-  int Idx = 1;
+  assert((Indices.size() % 2) == 0);
+
+  SDValue Wt;
+  SDValue Ws;
+  const auto &Begin = Indices.begin();
+  const auto &Mid = Indices.begin() + Indices.size() / 2;
+  const auto &End = Indices.end();
+
+  if (fitsRegularPattern<int>(Begin, 1, Mid, 1, 2))
+    Wt = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Begin, 1, Mid, Indices.size() + 1, 2))
+    Wt = Op->getOperand(1);
+  else
+    return SDValue();
 
-  for (unsigned i = 0; i < Indices.size(); ++i) {
-    if (Indices[i] != -1 && Indices[i] != Idx)
-      return SDValue();
-    Idx += 2;
-  }
+  if (fitsRegularPattern<int>(Mid, 1, End, 1, 2))
+    Ws = Op->getOperand(0);
+  else if (fitsRegularPattern<int>(Mid, 1, End, Indices.size() + 1, 2))
+    Ws = Op->getOperand(1);
+  else
+    return SDValue();
 
-  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Op->getOperand(0),
-                     Op->getOperand(1));
+  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Ws, Wt);
 }
 
 // Lower VECTOR_SHUFFLE into VSHF.
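// A brief standalone sketch (not the patch's code) of the Begin/Mid/End split
// used by the PCKEV/PCKOD lowerings above: the first half of the mask chooses
// wt, the second half chooses ws, and each half independently matches either
// input. The v4i32 mask <0, 2, 4, 6> with n = 4 and the function name
// halfSelectsOperand are invented for this illustration.
#include <cstdio>
#include <vector>

// Returns 0 or 1 for the operand a half-mask of even element indices selects,
// or -1 if it matches neither input.
static int halfSelectsOperand(const std::vector<int> &Half, int N) {
  auto Matches = [&](int First) {
    int Expected = First;
    for (int V : Half) {
      if (V != -1 && V != Expected)
        return false;
      Expected += 2;
    }
    return true;
  };
  return Matches(0) ? 0 : Matches(N) ? 1 : -1;
}

int main() {
  std::vector<int> Mask = {0, 2, 4, 6}; // v4i32 shuffle mask, n = 4
  std::vector<int> FirstHalf(Mask.begin(), Mask.begin() + 2);
  std::vector<int> SecondHalf(Mask.begin() + 2, Mask.end());
  // First half <0, 2> selects operand 0 (wt), second half <4, 6> selects
  // operand 1 (ws), consistent with the pckev.w CHECK lines below taking the
  // second input's register first.
  std::printf("wt <- operand %d\n", halfSelectsOperand(FirstHalf, 4));
  std::printf("ws <- operand %d\n", halfSelectsOperand(SecondHalf, 4));
  return 0;
}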
@@ -2711,10 +2860,11 @@ for (int i = 0; i < ResTyNumElts; ++i) Indices.push_back(Node->getMaskElt(i)); - SDValue Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG); - if (Result.getNode()) - return Result; - Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG); + // splati.[bhwd] is preferable to the others but is matched from + // MipsISD::VSHF. + if (isVECTOR_SHUFFLE_SPLATI(Op, ResTy, Indices, DAG)) + return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG); + SDValue Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG); if (Result.getNode()) return Result; Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG); @@ -2732,6 +2882,9 @@ Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG); if (Result.getNode()) return Result; + Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG); + if (Result.getNode()) + return Result; return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG); } Index: test/CodeGen/Mips/msa/shuffle.ll =================================================================== --- test/CodeGen/Mips/msa/shuffle.ll +++ test/CodeGen/Mips/msa/shuffle.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: vshf_v16i8_0: + ; CHECK-LABEL: vshf_v16i8_0: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -14,11 +14,10 @@ ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size vshf_v16i8_0 } define void @vshf_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: vshf_v16i8_1: + ; CHECK-LABEL: vshf_v16i8_1: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -28,11 +27,10 @@ ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size vshf_v16i8_1 } define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: vshf_v16i8_2: + ; CHECK-LABEL: vshf_v16i8_2: %1 = load <16 x i8>, <16 x i8>* %a %2 = load <16 x i8>, <16 x i8>* %b @@ -45,11 +43,10 @@ ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size vshf_v16i8_2 } define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: vshf_v16i8_3: + ; CHECK-LABEL: vshf_v16i8_3: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -65,11 +62,10 @@ ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size vshf_v16i8_3 } define void @vshf_v16i8_4(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: vshf_v16i8_4: + ; CHECK-LABEL: vshf_v16i8_4: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -79,11 +75,10 @@ ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size vshf_v16i8_4 } define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: vshf_v8i16_0: + ; CHECK-LABEL: vshf_v8i16_0: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) @@ -95,11 +90,10 @@ ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size vshf_v8i16_0 } define void @vshf_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: vshf_v8i16_1: + ; CHECK-LABEL: vshf_v8i16_1: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) @@ -109,11 +103,10 @@ ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size vshf_v8i16_1 } define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: vshf_v8i16_2: + ; CHECK-LABEL: vshf_v8i16_2: %1 = load <8 x i16>, <8 x i16>* %a %2 = load <8 x i16>, <8 x i16>* %b @@ 
-126,11 +119,10 @@ ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size vshf_v8i16_2 } define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: vshf_v8i16_3: + ; CHECK-LABEL: vshf_v8i16_3: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) @@ -146,11 +138,10 @@ ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size vshf_v8i16_3 } define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: vshf_v8i16_4: + ; CHECK-LABEL: vshf_v8i16_4: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) @@ -160,14 +151,13 @@ ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size vshf_v8i16_4 } ; Note: v4i32 only has one 4-element set so it's impossible to get a vshf.w ; instruction when using a single vector. define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: vshf_v4i32_0: + ; CHECK-LABEL: vshf_v4i32_0: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) @@ -177,25 +167,23 @@ ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size vshf_v4i32_0 } define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: vshf_v4i32_1: + ; CHECK-LABEL: vshf_v4i32_1: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> - ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85 + ; CHECK-DAG: splati.w [[R3:\$w[0-9]+]], [[R1]][1] store <4 x i32> %2, <4 x i32>* %c ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size vshf_v4i32_1 } define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: vshf_v4i32_2: + ; CHECK-LABEL: vshf_v4i32_2: %1 = load <4 x i32>, <4 x i32>* %a %2 = load <4 x i32>, <4 x i32>* %b @@ -206,11 +194,10 @@ ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size vshf_v4i32_2 } define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: vshf_v4i32_3: + ; CHECK-LABEL: vshf_v4i32_3: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) @@ -226,25 +213,24 @@ ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size vshf_v4i32_3 } define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: vshf_v4i32_4: + ; CHECK-LABEL: vshf_v4i32_4: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> - ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85 + ; The two operand vectors are the same so element 1 and 5 are equivalent. 
+ ; CHECK-DAG: splati.w [[R3:\$w[0-9]+]], [[R1]][1] store <4 x i32> %2, <4 x i32>* %c ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size vshf_v4i32_4 } define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: vshf_v2i64_0: + ; CHECK-LABEL: vshf_v2i64_0: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) @@ -256,11 +242,10 @@ ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size vshf_v2i64_0 } define void @vshf_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: vshf_v2i64_1: + ; CHECK-LABEL: vshf_v2i64_1: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) @@ -270,11 +255,10 @@ ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size vshf_v2i64_1 } define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: vshf_v2i64_2: + ; CHECK-LABEL: vshf_v2i64_2: %1 = load <2 x i64>, <2 x i64>* %a %2 = load <2 x i64>, <2 x i64>* %b @@ -287,11 +271,10 @@ ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size vshf_v2i64_2 } define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: vshf_v2i64_3: + ; CHECK-LABEL: vshf_v2i64_3: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) @@ -307,11 +290,10 @@ ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size vshf_v2i64_3 } define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: vshf_v2i64_4: + ; CHECK-LABEL: vshf_v2i64_4: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) @@ -321,11 +303,10 @@ ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size vshf_v2i64_4 } define void @shf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: shf_v16i8_0: + ; CHECK-LABEL: shf_v16i8_0: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -335,11 +316,10 @@ ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size shf_v16i8_0 } define void @shf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: shf_v8i16_0: + ; CHECK-LABEL: shf_v8i16_0: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) @@ -349,11 +329,10 @@ ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size shf_v8i16_0 } define void @shf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: shf_v4i32_0: + ; CHECK-LABEL: shf_v4i32_0: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) @@ -363,13 +342,12 @@ ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size shf_v4i32_0 } ; shf.d does not exist define void @ilvev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: ilvev_v16i8_0: + ; CHECK-LABEL: ilvev_v16i8_0: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -377,64 +355,177 @@ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> - ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <16 x i8> %3, <16 x i8>* %c ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size ilvev_v16i8_0 } define void @ilvev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: ilvev_v8i16_0: + ; CHECK-LABEL: ilvev_v8i16_0: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) %2 = load <8 x i16>, <8 x i16>* %b ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x 
i32> - ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <8 x i16> %3, <8 x i16>* %c ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size ilvev_v8i16_0 } define void @ilvev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: ilvev_v4i32_0: + ; CHECK-LABEL: ilvev_v4i32_0: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x i32>, <4 x i32>* %b ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> - ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %3, <4 x i32>* %c ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size ilvev_v4i32_0 } define void @ilvev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: ilvev_v2i64_0: + ; CHECK-LABEL: ilvev_v2i64_0: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x i64>, <2 x i64>* %b ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> - ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R2]], [[R1]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +; Interleaving one operand with itself. +define void @ilvev_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvev_v16i8_1: + + %1 = load <16 x i8>, <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>, <16 x i8>* %b + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvev_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvev_v8i16_1: + + %1 = load <8 x i16>, <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>, <8 x i16>* %b + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvev_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvev_v4i32_1: + + %1 = load <4 x i32>, <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>, <4 x i32>* %b + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvev_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvev_v2i64_1: + + %1 = load <2 x i64>, <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>, <2 x i64>* %b + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvev.d with two identical operands is equivalent to splati.d + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][0] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +define void @ilvev_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvev_v16i8_2: + + %1 = load <16 x i8>, <16 x i8>* %a + %2 = load <16 x i8>, <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], 
[[R2]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvev_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvev_v8i16_2: + + %1 = load <8 x i16>, <8 x i16>* %a + %2 = load <8 x i16>, <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvev_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvev_v4i32_2: + + %1 = load <4 x i32>, <4 x i32>* %a + %2 = load <4 x i32>, <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvev_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvev_v2i64_2: + + %1 = load <2 x i64>, <2 x i64>* %a + %2 = load <2 x i64>, <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvev.d with two identical operands is equivalent to splati.d + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][0] store <2 x i64> %3, <2 x i64>* %c ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size ilvev_v2i64_0 } define void @ilvod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: ilvod_v16i8_0: + ; CHECK-LABEL: ilvod_v16i8_0: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -447,11 +538,10 @@ ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size ilvod_v16i8_0 } define void @ilvod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: ilvod_v8i16_0: + ; CHECK-LABEL: ilvod_v8i16_0: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) @@ -463,11 +553,10 @@ ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size ilvod_v8i16_0 } define void @ilvod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: ilvod_v4i32_0: + ; CHECK-LABEL: ilvod_v4i32_0: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) @@ -479,11 +568,10 @@ ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size ilvod_v4i32_0 } define void @ilvod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: ilvod_v2i64_0: + ; CHECK-LABEL: ilvod_v2i64_0: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) @@ -495,11 +583,126 @@ ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size ilvod_v2i64_0 } -define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: ilvl_v16i8_0: +define void @ilvod_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvod_v16i8_1: + + %1 = load <16 x i8>, <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>, <16 x i8>* %b + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvod_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvod_v8i16_1: + + %1 = load <8 x i16>, <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>, <8 x i16>* %b + %3 = 
shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvod_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvod_v4i32_1: + + %1 = load <4 x i32>, <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>, <4 x i32>* %b + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvod_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvod_v2i64_1: + + %1 = load <2 x i64>, <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>, <2 x i64>* %b + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvod.d with two identical operands is equivalent to splati.d + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +define void @ilvod_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvod_v16i8_2: + + %1 = load <16 x i8>, <16 x i8>* %a + %2 = load <16 x i8>, <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvod_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvod_v8i16_2: + + %1 = load <8 x i16>, <8 x i16>* %a + %2 = load <8 x i16>, <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvod_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvod_v4i32_2: + + %1 = load <4 x i32>, <4 x i32>* %a + %2 = load <4 x i32>, <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvod_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvod_v2i64_2: + + %1 = load <2 x i64>, <2 x i64>* %a + %2 = load <2 x i64>, <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvod.d with two identical operands is equivalent to splati.d + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][1] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvr_v16i8_0: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -507,65 +710,177 @@ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> - ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <16 x i8> %3, <16 x i8>* %c ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size ilvl_v16i8_0 } -define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x 
i16>* %b) nounwind { - ; CHECK: ilvl_v8i16_0: +define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvr_v8i16_0: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) %2 = load <8 x i16>, <8 x i16>* %b ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> - ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <8 x i16> %3, <8 x i16>* %c ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size ilvl_v8i16_0 } -define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: ilvl_v4i32_0: +define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvr_v4i32_0: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x i32>, <4 x i32>* %b ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> - ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %3, <4 x i32>* %c ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size ilvl_v4i32_0 } -define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: ilvl_v2i64_0: +define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvr_v2i64_0: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x i64>, <2 x i64>* %b ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> - ; ilvl.d and ilvev.d are equivalent for v2i64 - ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; ilvr.d and ilvev.d are equivalent for v2i64 + ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <2 x i64> %3, <2 x i64>* %c ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size ilvl_v2i64_0 } -define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: ilvr_v16i8_0: +define void @ilvr_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvr_v16i8_1: + + %1 = load <16 x i8>, <16 x i8>* %a + %2 = load <16 x i8>, <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvr_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvr_v8i16_1: + + %1 = load <8 x i16>, <8 x i16>* %a + %2 = load <8 x i16>, <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvr_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvr_v4i32_1: + + %1 = load <4 x i32>, <4 x i32>* %a + %2 = load <4 x i32>, <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvr_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvr_v2i64_1: + + %1 = load <2 x i64>, <2 x i64>* 
%a + %2 = load <2 x i64>, <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvr.d and splati.d are equivalent for v2i64 + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][0] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +define void @ilvr_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvr_v16i8_2: + + %1 = load <16 x i8>, <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>, <16 x i8>* %b + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvr_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvr_v8i16_2: + + %1 = load <8 x i16>, <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>, <8 x i16>* %b + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvr_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvr_v4i32_2: + + %1 = load <4 x i32>, <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>, <4 x i32>* %b + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvr_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvr_v2i64_2: + + %1 = load <2 x i64>, <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>, <2 x i64>* %b + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvr.d and splati.d are equivalent for v2i64 + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][0] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvl_v16i8_0: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -573,65 +888,177 @@ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> - ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <16 x i8> %3, <16 x i8>* %c ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size ilvr_v16i8_0 } -define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: ilvr_v8i16_0: +define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvl_v8i16_0: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) %2 = load <8 x i16>, <8 x i16>* %b ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> - ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <8 x i16> %3, <8 x i16>* %c ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size ilvr_v8i16_0 } -define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: ilvr_v4i32_0: +define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvl_v4i32_0: %1 = load <4 x i32>, <4 x 
i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x i32>, <4 x i32>* %b ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> - ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %3, <4 x i32>* %c ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size ilvr_v4i32_0 } -define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: ilvr_v2i64_0: +define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvl_v2i64_0: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) %2 = load <2 x i64>, <2 x i64>* %b ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> - ; ilvr.d and ilvod.d are equivalent for v2i64 + ; ilvl.d and ilvod.d are equivalent for v2i64 ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] store <2 x i64> %3, <2 x i64>* %c ; CHECK-DAG: st.d [[R3]], 0($4) ret void - ; CHECK: .size ilvr_v2i64_0 +} + +define void @ilvl_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvl_v16i8_1: + + %1 = load <16 x i8>, <16 x i8>* %a + %2 = load <16 x i8>, <16 x i8>* %b + ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvl_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvl_v8i16_1: + + %1 = load <8 x i16>, <8 x i16>* %a + %2 = load <8 x i16>, <8 x i16>* %b + ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvl_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvl_v4i32_1: + + %1 = load <4 x i32>, <4 x i32>* %a + %2 = load <4 x i32>, <4 x i32>* %b + ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvl_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvl_v2i64_1: + + %1 = load <2 x i64>, <2 x i64>* %a + %2 = load <2 x i64>, <2 x i64>* %b + ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvl.d and splati.d are equivalent for v2i64 + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][1] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +define void @ilvl_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: ilvl_v16i8_2: + + %1 = load <16 x i8>, <16 x i8>* %a + ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) + %2 = load <16 x i8>, <16 x i8>* %b + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, + <16 x i32> + ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <16 x i8> %3, <16 x i8>* %c + ; CHECK-DAG: st.b [[R3]], 0($4) + + ret void +} + +define void @ilvl_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { + ; CHECK-LABEL: ilvl_v8i16_2: + + %1 = load <8 x i16>, <8 x i16>* %a + ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) + %2 = load <8 x i16>, <8 x 
i16>* %b + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> + ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <8 x i16> %3, <8 x i16>* %c + ; CHECK-DAG: st.h [[R3]], 0($4) + + ret void +} + +define void @ilvl_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { + ; CHECK-LABEL: ilvl_v4i32_2: + + %1 = load <4 x i32>, <4 x i32>* %a + ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) + %2 = load <4 x i32>, <4 x i32>* %b + %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> + ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R1]] + store <4 x i32> %3, <4 x i32>* %c + ; CHECK-DAG: st.w [[R3]], 0($4) + + ret void +} + +define void @ilvl_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { + ; CHECK-LABEL: ilvl_v2i64_2: + + %1 = load <2 x i64>, <2 x i64>* %a + ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) + %2 = load <2 x i64>, <2 x i64>* %b + %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> + ; ilvl.d and splati.d are equivalent for v2i64 + ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void } define void @pckev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { - ; CHECK: pckev_v16i8_0: + ; CHECK-LABEL: pckev_v16i8_0: %1 = load <16 x i8>, <16 x i8>* %a ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5) @@ -639,48 +1066,45 @@ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> - ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <16 x i8> %3, <16 x i8>* %c ; CHECK-DAG: st.b [[R3]], 0($4) ret void - ; CHECK: .size pckev_v16i8_0 } define void @pckev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind { - ; CHECK: pckev_v8i16_0: + ; CHECK-LABEL: pckev_v8i16_0: %1 = load <8 x i16>, <8 x i16>* %a ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5) %2 = load <8 x i16>, <8 x i16>* %b ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> - ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <8 x i16> %3, <8 x i16>* %c ; CHECK-DAG: st.h [[R3]], 0($4) ret void - ; CHECK: .size pckev_v8i16_0 } define void @pckev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind { - ; CHECK: pckev_v4i32_0: + ; CHECK-LABEL: pckev_v4i32_0: %1 = load <4 x i32>, <4 x i32>* %a ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5) %2 = load <4 x i32>, <4 x i32>* %b ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> - ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R2]], [[R1]] store <4 x i32> %3, <4 x i32>* %c ; CHECK-DAG: st.w [[R3]], 0($4) ret void - ; CHECK: .size pckev_v4i32_0 } define void @pckev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind { - ; CHECK: pckev_v2i64_0: + ; CHECK-LABEL: pckev_v2i64_0: %1 = load <2 x i64>, <2 x i64>* %a ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5) @@ -688,16 +1112,131 @@ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6) %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> ; pckev.d and ilvev.d are equivalent for v2i64 - ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]] + ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R2]], [[R1]] + store <2 x i64> %3, <2 x i64>* %c + ; CHECK-DAG: st.d [[R3]], 0($4) + + ret void +} + +define void @pckev_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind { + ; CHECK-LABEL: pckev_v16i8_1: + + %1 = load <16 x 
i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32>
+  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32>
+  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32>
+  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32>
+  ; pckev.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][0]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32>
+  ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32>
+  ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32>
+  ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckev_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckev_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32>
+  ; pckev.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][0]
   store <2 x i64> %3, <2 x i64>* %c
   ; CHECK-DAG: st.d [[R3]], 0($4)

   ret void
-  ; CHECK: .size pckev_v2i64_0
 }

 define void @pckod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
-  ; CHECK: pckod_v16i8_0:
+  ; CHECK-LABEL: pckod_v16i8_0:

   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -705,48 +1244,45 @@
   ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
                      <16 x i32>
-  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <16 x i8> %3, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R3]], 0($4)

   ret void
-  ; CHECK: .size pckod_v16i8_0
 }

 define void @pckod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
-  ; CHECK: pckod_v8i16_0:
+  ; CHECK-LABEL: pckod_v8i16_0:

   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
   %2 = load <8 x i16>, <8 x i16>* %b
   ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32>
-  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <8 x i16> %3, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R3]], 0($4)

   ret void
-  ; CHECK: .size pckod_v8i16_0
 }

 define void @pckod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
-  ; CHECK: pckod_v4i32_0:
+  ; CHECK-LABEL: pckod_v4i32_0:

   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = load <4 x i32>, <4 x i32>* %b
   ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
   %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32>
-  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
   store <4 x i32> %3, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)

   ret void
-  ; CHECK: .size pckod_v4i32_0
 }

 define void @pckod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
-  ; CHECK: pckod_v2i64_0:
+  ; CHECK-LABEL: pckod_v2i64_0:

   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -759,11 +1295,126 @@
   ; CHECK-DAG: st.d [[R3]], 0($4)

   ret void
-  ; CHECK: .size pckod_v2i64_0
+}
+
+define void @pckod_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v16i8_1:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  %2 = load <16 x i8>, <16 x i8>* %b
+  ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32>
+  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v8i16_1:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  %2 = load <8 x i16>, <8 x i16>* %b
+  ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32>
+  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v4i32_1:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  %2 = load <4 x i32>, <4 x i32>* %b
+  ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32>
+  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v2i64_1:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  %2 = load <2 x i64>, <2 x i64>* %b
+  ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32>
+  ; pckod.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R2]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v16i8_2:
+
+  %1 = load <16 x i8>, <16 x i8>* %a
+  ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <16 x i8>, <16 x i8>* %b
+  %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+                     <16 x i32>
+  ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <16 x i8> %3, <16 x i8>* %c
+  ; CHECK-DAG: st.b [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v8i16_2:
+
+  %1 = load <8 x i16>, <8 x i16>* %a
+  ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <8 x i16>, <8 x i16>* %b
+  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32>
+  ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <8 x i16> %3, <8 x i16>* %c
+  ; CHECK-DAG: st.h [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v4i32_2:
+
+  %1 = load <4 x i32>, <4 x i32>* %a
+  ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <4 x i32>, <4 x i32>* %b
+  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32>
+  ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+  store <4 x i32> %3, <4 x i32>* %c
+  ; CHECK-DAG: st.w [[R3]], 0($4)
+
+  ret void
+}
+
+define void @pckod_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+  ; CHECK-LABEL: pckod_v2i64_2:
+
+  %1 = load <2 x i64>, <2 x i64>* %a
+  ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+  %2 = load <2 x i64>, <2 x i64>* %b
+  %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32>
+  ; pckod.d and splati.d are equivalent for v2i64
+  ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+  store <2 x i64> %3, <2 x i64>* %c
+  ; CHECK-DAG: st.d [[R3]], 0($4)
+
+  ret void
 }

 define void @splati_v16i8_0(<16 x i8>* %c, <16 x i8>* %a) nounwind {
-  ; CHECK: splati_v16i8_0:
+  ; CHECK-LABEL: splati_v16i8_0:

   %1 = load <16 x i8>, <16 x i8>* %a
   ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
@@ -774,11 +1425,10 @@
   ; CHECK-DAG: st.b [[R3]], 0($4)

   ret void
-  ; CHECK: .size splati_v16i8_0
 }

 define void @splati_v8i16_0(<8 x i16>* %c, <8 x i16>* %a) nounwind {
-  ; CHECK: splati_v8i16_0:
+  ; CHECK-LABEL: splati_v8i16_0:

   %1 = load <8 x i16>, <8 x i16>* %a
   ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
@@ -788,26 +1438,23 @@
   ; CHECK-DAG: st.h [[R3]], 0($4)

   ret void
-  ; CHECK: .size splati_v8i16_0
 }

 define void @splati_v4i32_0(<4 x i32>* %c, <4 x i32>* %a) nounwind {
-  ; CHECK: splati_v4i32_0:
+  ; CHECK-LABEL: splati_v4i32_0:

   %1 = load <4 x i32>, <4 x i32>* %a
   ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32>
-  ; shf.w and splati.w are equivalent
-  ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 255
+  ; CHECK-DAG: splati.w [[R3:\$w[0-9]+]], [[R1]][3]
   store <4 x i32> %2, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R3]], 0($4)

   ret void
-  ; CHECK: .size splati_v4i32_0
 }

 define void @splati_v2i64_0(<2 x i64>* %c, <2 x i64>* %a) nounwind {
-  ; CHECK: splati_v2i64_0:
+  ; CHECK-LABEL: splati_v2i64_0:

   %1 = load <2 x i64>, <2 x i64>* %a
   ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
@@ -817,5 +1464,4 @@
   ; CHECK-DAG: st.d [[R3]], 0($4)

   ret void
-  ; CHECK: .size splati_v2i64_0
 }
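
A note on the "pckev.d and splati.d are equivalent for v2i64" comments in the tests
above: with only two 64-bit elements per register, the "even elements" of a source
are just element 0 and the "odd elements" are just element 1, so a pckev.d or
pckod.d whose two source operands are the same register collapses to a splat of
that element, which is why the v2i64 cases check for splati.d. Below is a minimal
standalone C++ sketch of that reasoning; pckev_d, pckod_d and splati_d are
illustrative models written for this note (not LLVM or MSA library code), and the
operand-order details of the real instructions are only approximated.

#include <array>
#include <cassert>
#include <cstdint>

using v2i64 = std::array<int64_t, 2>;

// Rough model of pckev.d wd, ws, wt: take the even-indexed (index 0) element
// of each source operand.
static v2i64 pckev_d(v2i64 ws, v2i64 wt) { return {wt[0], ws[0]}; }

// Rough model of pckod.d wd, ws, wt: take the odd-indexed (index 1) element
// of each source operand.
static v2i64 pckod_d(v2i64 ws, v2i64 wt) { return {wt[1], ws[1]}; }

// Rough model of splati.d wd, ws[n]: replicate element n into both result
// elements.
static v2i64 splati_d(v2i64 ws, int n) { return {ws[n], ws[n]}; }

int main() {
  v2i64 v = {0x1111, 0x2222};
  // With identical source operands, pckev.d is a splat of element 0 and
  // pckod.d is a splat of element 1, matching the splati.d checks above.
  assert(pckev_d(v, v) == splati_d(v, 0));
  assert(pckod_d(v, v) == splati_d(v, 1));
  return 0;
}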