Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
@@ -1080,6 +1080,11 @@
     /// from one vector into the other.
     SDValue lowerToVINSERTH(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
 
+    /// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be
+    /// handled by the VINSERTB instruction introduced in ISA 3.0. This is
+    /// essentially the v16i8 vector version of lowerToVINSERTH.
+    SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
   }; // end class PPCTargetLowering
 
   namespace PPC {
Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7890,6 +7890,107 @@
   return DAG.getNode(ISD::BITCAST, dl, VT, T);
 }
 
+/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
+/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
+/// SDValue.
+SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
+                                           SelectionDAG &DAG) const {
+  const unsigned BytesInVector = 16;
+  bool IsLE = Subtarget.isLittleEndian();
+  SDLoc dl(N);
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned ShiftElts = 0, InsertAtByte = 0;
+  bool Swap = false;
+
+  // Shifts required to get the byte we want at element 7, indexed by the
+  // byte's position within its source vector.
+  unsigned LittleEndianShifts[] = {8, 7,  6,  5,  4,  3,  2,  1,
+                                   0, 15, 14, 13, 12, 11, 10, 9};
+  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
+                                1, 2,  3,  4,  5,  6,  7,  8};
+
+  ArrayRef<int> Mask = N->getMask();
+  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+  // For each mask element, find out if we're just inserting something
+  // from V2 into V1 or vice versa.
+  // Possible permutations inserting an element from V2 into V1:
+  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  //   ...
+  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
+  // Inserting from V1 into V2 will be similar, except mask range will be
+  // [16,31].
+
+  bool FoundCandidate = false;
+  // If both vector operands for the shuffle are the same vector, the mask
+  // will contain only elements from the first one and the second one will be
+  // undef.
+  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
+  // Go through the mask of bytes to find an element that's being moved
+  // from one vector to the other.
+  for (unsigned i = 0; i < BytesInVector; ++i) {
+    unsigned CurrentElement = Mask[i];
+    // If 2nd operand is undefined, we should only look for the VINSERTB source
+    // element (7 on BE, 8 on LE) in the Mask.
+    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
+      continue;
+
+    bool OtherElementsInOrder = true;
+    // Examine the other elements in the Mask to see if they're in original
+    // order.
+    for (unsigned j = 0; j < BytesInVector; ++j) {
+      if (j == i)
+        continue;
+      // If CurrentElement is from V1 [0,15], then we expect the rest of the
+      // Mask to be from V2 [16,31] and vice versa, unless the 2nd operand is
+      // undefined, in which case we always pick from the 1st operand.
+      int MaskOffset =
+          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
+      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
+        OtherElementsInOrder = false;
+        break;
+      }
+    }
+    // If other elements are in original order, we record the number of shifts
+    // we need to get the element we want into element 7. Also record which byte
+    // in the vector we should insert into.
+    if (OtherElementsInOrder) {
+      // If 2nd operand is undefined, we assume no shifts and no swapping.
+      if (V2.isUndef()) {
+        ShiftElts = 0;
+        Swap = false;
+      } else {
+        // Only the low 4 bits are needed for the shift: they give the byte's
+        // position within its source vector; Swap records which vector it is.
+        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
+                         : BigEndianShifts[CurrentElement & 0xF];
+        Swap = CurrentElement < BytesInVector;
+      }
+      InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
+      FoundCandidate = true;
+      break;
+    }
+  }
+
+  if (!FoundCandidate)
+    return SDValue();
+
+  // Candidate found, construct the proper SDAG sequence with VINSERTB,
+  // optionally with VECSHL if shift is required.
+  if (Swap)
+    std::swap(V1, V2);
+  if (V2.isUndef())
+    V2 = V1;
+  // Shift V2 so that the byte we want ends up in element 7.
+  if (ShiftElts)
+    V2 = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
+                     DAG.getConstant(ShiftElts, dl, MVT::i32));
+  return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
+                     DAG.getConstant(InsertAtByte, dl, MVT::i32));
+}
+
 /// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
 /// by the VINSERTH instruction introduced in ISA 3.0, else just return default
 /// SDValue.
@@ -8037,8 +8138,11 @@
   }
 
   if (Subtarget.hasP9Altivec()) {
-    SDValue NewISDNode = lowerToVINSERTH(SVOp, DAG);
-    if (NewISDNode)
+    SDValue NewISDNode;
+    if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
+      return NewISDNode;
+
+    if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
       return NewISDNode;
   }
 
Index: llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -1312,7 +1312,12 @@
 def VEXTUWRX : VX1_RT5_RA5_VB5<1933, "vextuwrx", []>;
 
 // Vector Insert Element Instructions
-def VINSERTB : VX1_VT5_UIM5_VB5<781, "vinsertb", []>;
+def VINSERTB : VXForm_1<781, (outs vrrc:$vD),
+                        (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
+                        "vinsertb $vD, $vB, $UIM", IIC_VecGeneral,
+                        [(set v16i8:$vD, (PPCvecinsert v16i8:$vDi, v16i8:$vB,
+                                                       imm32SExt16:$UIM))]>,
+               RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
 def VINSERTH : VXForm_1<845, (outs vrrc:$vD),
                         (ins vrrc:$vDi, u4imm:$UIM, vrrc:$vB),
                         "vinserth $vD, $vB, $UIM", IIC_VecGeneral,
Index: llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
+++ llvm/trunk/test/CodeGen/PowerPC/p9-vinsert-vextract.ll
@@ -298,3 +298,581 @@
   ret <8 x i16> %vecins
 }
 
+; The following testcases take one byte element from the second vector and
+; insert it at various locations in the first vector.
+define <16 x i8> @shuffle_vector_byte_0_16(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_0_16
+; CHECK: vsldoi 3, 3, 3, 8
+; CHECK: vinsertb 2, 3, 15
+; CHECK-BE-LABEL: shuffle_vector_byte_0_16
+; CHECK-BE: vsldoi 3, 3, 3, 9
+; CHECK-BE: vinsertb 2, 3, 0
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_25(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_1_25
+; CHECK: vsldoi 3, 3, 3, 15
+; CHECK: vinsertb 2, 3, 14
+; CHECK-BE-LABEL: shuffle_vector_byte_1_25
+; CHECK-BE: vsldoi 3, 3, 3, 2
+; CHECK-BE: vinsertb 2, 3, 1
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 25, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_18(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_2_18
+; CHECK: vsldoi 3, 3, 3, 6
+; CHECK: vinsertb 2, 3, 13
+; CHECK-BE-LABEL: shuffle_vector_byte_2_18
+; CHECK-BE: vsldoi 3, 3, 3, 11
+; CHECK-BE: vinsertb 2, 3, 2
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_27(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_3_27
+; CHECK: vsldoi 3, 3, 3, 13
+; CHECK: vinsertb 2, 3, 12
+; CHECK-BE-LABEL: shuffle_vector_byte_3_27
+; CHECK-BE: vsldoi 3, 3, 3, 4
+; CHECK-BE: vinsertb 2, 3, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 27, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_20(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_4_20
+; CHECK: vsldoi 3, 3, 3, 4
+; CHECK: vinsertb 2, 3, 11
+; CHECK-BE-LABEL: shuffle_vector_byte_4_20
+; CHECK-BE: vsldoi 3, 3, 3, 13
+; CHECK-BE: vinsertb 2, 3, 4
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_29(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_5_29
+; CHECK: vsldoi 3, 3, 3, 11
+; CHECK: vinsertb 2, 3, 10
+; CHECK-BE-LABEL: shuffle_vector_byte_5_29
+; CHECK-BE: vsldoi 3, 3, 3, 6
+; CHECK-BE: vinsertb 2, 3, 5
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 29, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_22(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_6_22
+; CHECK: vsldoi 3, 3, 3, 2
+; CHECK: vinsertb 2, 3, 9
+; CHECK-BE-LABEL: shuffle_vector_byte_6_22
+; CHECK-BE: vsldoi 3, 3, 3, 15
+; CHECK-BE: vinsertb 2, 3, 6
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 22, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_31(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_7_31
+; CHECK: vsldoi 3, 3, 3, 9
+; CHECK: vinsertb 2, 3, 8
+; CHECK-BE-LABEL: shuffle_vector_byte_7_31
+; CHECK-BE: vsldoi 3, 3, 3, 8
+; CHECK-BE: vinsertb 2, 3, 7
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_24(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_8_24
+; CHECK: vinsertb 2, 3, 7
+; CHECK-BE-LABEL: shuffle_vector_byte_8_24
+; CHECK-BE: vsldoi 3, 3, 3, 1
+; CHECK-BE: vinsertb 2, 3, 8
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_17(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_9_17
+; CHECK: vsldoi 3, 3, 3, 7
+; CHECK: vinsertb 2, 3, 6
+; CHECK-BE-LABEL: shuffle_vector_byte_9_17
+; CHECK-BE: vsldoi 3, 3, 3, 10
+; CHECK-BE: vinsertb 2, 3, 9
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_26(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_10_26
+; CHECK: vsldoi 3, 3, 3, 14
+; CHECK: vinsertb 2, 3, 5
+; CHECK-BE-LABEL: shuffle_vector_byte_10_26
+; CHECK-BE: vsldoi 3, 3, 3, 3
+; CHECK-BE: vinsertb 2, 3, 10
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_19(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_11_19
+; CHECK: vsldoi 3, 3, 3, 5
+; CHECK: vinsertb 2, 3, 4
+; CHECK-BE-LABEL: shuffle_vector_byte_11_19
+; CHECK-BE: vsldoi 3, 3, 3, 12
+; CHECK-BE: vinsertb 2, 3, 11
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 19, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_28(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_12_28
+; CHECK: vsldoi 3, 3, 3, 12
+; CHECK: vinsertb 2, 3, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_12_28
+; CHECK-BE: vsldoi 3, 3, 3, 5
+; CHECK-BE: vinsertb 2, 3, 12
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_21(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_13_21
+; CHECK: vsldoi 3, 3, 3, 3
+; CHECK: vinsertb 2, 3, 2
+; CHECK-BE-LABEL: shuffle_vector_byte_13_21
+; CHECK-BE: vsldoi 3, 3, 3, 14
+; CHECK-BE: vinsertb 2, 3, 13
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 21, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_30(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_14_30
+; CHECK: vsldoi 3, 3, 3, 10
+; CHECK: vinsertb 2, 3, 1
+; CHECK-BE-LABEL: shuffle_vector_byte_14_30
+; CHECK-BE: vsldoi 3, 3, 3, 7
+; CHECK-BE: vinsertb 2, 3, 14
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 30, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_23(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_15_23
+; CHECK: vsldoi 3, 3, 3, 1
+; CHECK: vinsertb 2, 3, 0
+; CHECK-BE-LABEL: shuffle_vector_byte_15_23
+; CHECK-BE: vinsertb 2, 3, 15
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 23>
+  ret <16 x i8> %vecins
+}
+
+; The following testcases take one byte element from the first vector and
+; insert it at various locations in the second vector.
+define <16 x i8> @shuffle_vector_byte_16_8(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_16_8
+; CHECK: vinsertb 3, 2, 15
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_16_8
+; CHECK-BE: vsldoi 2, 2, 2, 1
+; CHECK-BE: vinsertb 3, 2, 0
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_17_1(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_17_1
+; CHECK: vsldoi 2, 2, 2, 7
+; CHECK: vinsertb 3, 2, 14
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_17_1
+; CHECK-BE: vsldoi 2, 2, 2, 10
+; CHECK-BE: vinsertb 3, 2, 1
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_18_10(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_18_10
+; CHECK: vsldoi 2, 2, 2, 14
+; CHECK: vinsertb 3, 2, 13
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_18_10
+; CHECK-BE: vsldoi 2, 2, 2, 3
+; CHECK-BE: vinsertb 3, 2, 2
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 10, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_19_3(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_19_3
+; CHECK: vsldoi 2, 2, 2, 5
+; CHECK: vinsertb 3, 2, 12
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_19_3
+; CHECK-BE: vsldoi 2, 2, 2, 12
+; CHECK-BE: vinsertb 3, 2, 3
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_20_12(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_20_12
+; CHECK: vsldoi 2, 2, 2, 12
+; CHECK: vinsertb 3, 2, 11
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_20_12
+; CHECK-BE: vsldoi 2, 2, 2, 5
+; CHECK-BE: vinsertb 3, 2, 4
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 12, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_21_5(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_21_5
+; CHECK: vsldoi 2, 2, 2, 3
+; CHECK: vinsertb 3, 2, 10
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_21_5
+; CHECK-BE: vsldoi 2, 2, 2, 14
+; CHECK-BE: vinsertb 3, 2, 5
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 5, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_22_14(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_22_14
+; CHECK: vsldoi 2, 2, 2, 10
+; CHECK: vinsertb 3, 2, 9
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_22_14
+; CHECK-BE: vsldoi 2, 2, 2, 7
+; CHECK-BE: vinsertb 3, 2, 6
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 14, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_23_7(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_23_7
+; CHECK: vsldoi 2, 2, 2, 1
+; CHECK: vinsertb 3, 2, 8
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_23_7
+; CHECK-BE: vinsertb 3, 2, 7
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_24_0(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_24_0
+; CHECK: vsldoi 2, 2, 2, 8
+; CHECK: vinsertb 3, 2, 7
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_24_0
+; CHECK-BE: vsldoi 2, 2, 2, 9
+; CHECK-BE: vinsertb 3, 2, 8
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_25_9(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_25_9
+; CHECK: vsldoi 2, 2, 2, 15
+; CHECK: vinsertb 3, 2, 6
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_25_9
+; CHECK-BE: vsldoi 2, 2, 2, 2
+; CHECK-BE: vinsertb 3, 2, 9
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 9, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_26_2(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_26_2
+; CHECK: vsldoi 2, 2, 2, 6
+; CHECK: vinsertb 3, 2, 5
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_26_2
+; CHECK-BE: vsldoi 2, 2, 2, 11
+; CHECK-BE: vinsertb 3, 2, 10
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 2, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_27_11(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_27_11
+; CHECK: vsldoi 2, 2, 2, 13
+; CHECK: vinsertb 3, 2, 4
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_27_11
+; CHECK-BE: vsldoi 2, 2, 2, 4
+; CHECK-BE: vinsertb 3, 2, 11
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 11, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_28_4(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_28_4
+; CHECK: vsldoi 2, 2, 2, 4
+; CHECK: vinsertb 3, 2, 3
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_28_4
+; CHECK-BE: vsldoi 2, 2, 2, 13
+; CHECK-BE: vinsertb 3, 2, 12
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 4, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_29_13(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_29_13
+; CHECK: vsldoi 2, 2, 2, 11
+; CHECK: vinsertb 3, 2, 2
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_29_13
+; CHECK-BE: vsldoi 2, 2, 2, 6
+; CHECK-BE: vinsertb 3, 2, 13
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 13, i32 30, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_30_6(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_30_6
+; CHECK: vsldoi 2, 2, 2, 2
+; CHECK: vinsertb 3, 2, 1
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_30_6
+; CHECK-BE: vsldoi 2, 2, 2, 15
+; CHECK-BE: vinsertb 3, 2, 14
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 6, i32 31>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_31_15(<16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_31_15
+; CHECK: vsldoi 2, 2, 2, 9
+; CHECK: vinsertb 3, 2, 0
+; CHECK: vmr 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_31_15
+; CHECK-BE: vsldoi 2, 2, 2, 8
+; CHECK-BE: vinsertb 3, 2, 15
+; CHECK-BE: vmr 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 15>
+  ret <16 x i8> %vecins
+}
+
+; The following testcases use the same vector in both arguments of the
+; shufflevector. If byte element 7 in BE mode (or 8 in LE mode) is the one
+; we're attempting to insert, then we can use the vector insert instruction.
+define <16 x i8> @shuffle_vector_byte_0_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_0_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_0_7
+; CHECK-BE: vinsertb 2, 2, 0
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_1_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_1_8
+; CHECK: vinsertb 2, 2, 14
+; CHECK-BE-LABEL: shuffle_vector_byte_1_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 8, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_2_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_2_8
+; CHECK: vinsertb 2, 2, 13
+; CHECK-BE-LABEL: shuffle_vector_byte_2_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_3_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_3_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_3_7
+; CHECK-BE: vinsertb 2, 2, 3
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_4_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_4_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_4_7
+; CHECK-BE: vinsertb 2, 2, 4
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_5_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_5_8
+; CHECK: vinsertb 2, 2, 10
+; CHECK-BE-LABEL: shuffle_vector_byte_5_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_6_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_6_8
+; CHECK: vinsertb 2, 2, 9
+; CHECK-BE-LABEL: shuffle_vector_byte_6_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_7_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_7_8
+; CHECK: vinsertb 2, 2, 8
+; CHECK-BE-LABEL: shuffle_vector_byte_7_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_8_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_8_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_8_7
+; CHECK-BE: vinsertb 2, 2, 8
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 7, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_9_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_9_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_9_7
+; CHECK-BE: vinsertb 2, 2, 9
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 7, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_10_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_10_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_10_7
+; CHECK-BE: vinsertb 2, 2, 10
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 7, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_11_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_11_8
+; CHECK: vinsertb 2, 2, 4
+; CHECK-BE-LABEL: shuffle_vector_byte_11_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 8, i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_12_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_12_8
+; CHECK: vinsertb 2, 2, 3
+; CHECK-BE-LABEL: shuffle_vector_byte_12_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 8, i32 13, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_13_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_13_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_13_7
+; CHECK-BE: vinsertb 2, 2, 13
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 7, i32 14, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_14_7(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_14_7
+; CHECK-NOT: vinsertb
+; CHECK-BE-LABEL: shuffle_vector_byte_14_7
+; CHECK-BE: vinsertb 2, 2, 14
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 7, i32 15>
+  ret <16 x i8> %vecins
+}
+
+define <16 x i8> @shuffle_vector_byte_15_8(<16 x i8> %a) {
+entry:
+; CHECK-LABEL: shuffle_vector_byte_15_8
+; CHECK: vinsertb 2, 2, 0
+; CHECK-BE-LABEL: shuffle_vector_byte_15_8
+; CHECK-BE-NOT: vinsertb
+  %vecins = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
+  ret <16 x i8> %vecins
+}