Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -134,13 +134,13 @@
     // Vector compare bitwise test
     NEON_TST,
 
-    // Operation for the immediate in vector shift
-    NEON_DUPIMM,
-
     // Vector saturating shift
     NEON_QSHLs,
     NEON_QSHLu,
 
+    // Vector dup
+    NEON_VDUP,
+
     // Vector dup by lane
     NEON_VDUPLANE
   };
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -72,6 +72,7 @@
     addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
     addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
     addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v8f16, &AArch64::FPR128RegClass);
     addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
     addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
   }
@@ -297,18 +298,28 @@
   setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
 
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
 
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f16, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
 
@@ -866,8 +877,6 @@
     return "AArch64ISD::NEON_CMPZ";
   case AArch64ISD::NEON_TST:
     return "AArch64ISD::NEON_TST";
-  case AArch64ISD::NEON_DUPIMM:
-    return "AArch64ISD::NEON_DUPIMM";
   case AArch64ISD::NEON_QSHLs:
     return "AArch64ISD::NEON_QSHLs";
   case AArch64ISD::NEON_QSHLu:
@@ -3342,7 +3351,7 @@
   case ISD::SHL:
     if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
       SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                       DAG.getConstant(Cnt, MVT::i32));
       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
     }
@@ -3352,7 +3361,7 @@
   case ISD::SRL:
     if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
       SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                       DAG.getConstant(Cnt, MVT::i32));
       return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
     }
@@ -3492,6 +3501,118 @@
       }
     }
   }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt, so keep going (do not
+  // take the SCALAR_TO_VECTOR shortcut) if we hit that case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  // Use VDUP for non-constant splats.
+  if (hasDominantValue && EltSize <= 64) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are DUPing a value that comes directly from a vector, we could
+      // just use DUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the DUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+        N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+                                  Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, &Ops[0], NumElts);
+      Val = LowerBUILD_VECTOR(Val, DAG, ST);
+      if (Val.getNode())
+        return DAG.getNode(ISD::BITCAST, DL, VT, Val);
+    }
+    if (usesOnlyOneValue && isConstant) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+    }
+  }
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
   return SDValue();
 }
 
@@ -3499,6 +3620,7 @@
 AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
@@ -3515,10 +3637,90 @@
     int Lane = SVN->getSplatIndex();
     // If this is undef splat, generate it via "just" vdup, if possible.
     if (Lane == -1) Lane = 0;
-
+    // Test if V1 is a BUILD_VECTOR that is equivalent to a SCALAR_TO_VECTOR.
+    if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(0))) {
+      bool IsScalarToVector = true;
+      for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+        if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+          IsScalarToVector = false;
+          break;
+        }
+      if (IsScalarToVector)
+        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
+    }
     return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
                        DAG.getConstant(Lane, MVT::i64));
   }
+
+  // For a shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate an
+  // insert by element from V2 into V1.
+  // If the shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 is a
+  // better insert target than V1, since fewer inserts are needed; so we count
+  // the elements to be inserted for both V1 and V2, and pick the one that
+  // needs fewer inserts as the insert target.
+
+  // Insert by element requires the input vector to hold the same number of
+  // elements as the output.
+  // Collect the elements that need to be inserted and their indices.
+  SmallVector<int, 8> NV1Elt;
+  SmallVector<int, 8> N1Index;
+  SmallVector<int, 8> NV2Elt;
+  SmallVector<int, 8> N2Index;
+  int Length = ShuffleMask.size();
+  for (int Maskindex = 0; Maskindex != Length; ++Maskindex) {
+    if (ShuffleMask[Maskindex] != Maskindex) {
+      NV1Elt.push_back(ShuffleMask[Maskindex]);
+      N1Index.push_back(Maskindex);
+    }
+  }
+  for (int Maskindex = 0; Maskindex != Length; ++Maskindex) {
+    if (ShuffleMask[Maskindex] != (Maskindex + Length)) {
+      NV2Elt.push_back(ShuffleMask[Maskindex]);
+      N2Index.push_back(Maskindex);
+    }
+  }
+
+  // If all lanes mismatch, neither V1 nor V2 is usable as the insert base.
+  bool IsV1Inserted = true;
+  bool IsV2Inserted = true;
+  if (Length - NV1Elt.size() < 1)
+    IsV1Inserted = false;
+  if (Length - NV2Elt.size() < 1)
+    IsV2Inserted = false;
+
+  // Decide which vector to insert into.
+  SDValue InsV = V1;
+  SmallVector<int, 8> InsArray = NV1Elt;
+  SmallVector<int, 8> InsIndex = N1Index;
+  if (IsV1Inserted || IsV2Inserted) {
+    if (NV1Elt.size() > NV2Elt.size()) {
+      InsV = V2;
+      InsArray = NV2Elt;
+      InsIndex = N2Index;
+    }
+  } else
+    InsV = DAG.getNode(ISD::UNDEF, dl, VT);
+
+  SDValue PassN;
+  int V1EltNum = V1.getValueType().getVectorNumElements();
+  for (int InsertNum = 0, Index = (NV1Elt.size() > NV2Elt.size()) ?
+                              NV2Elt.size() : NV1Elt.size();
+       InsertNum != Index; ++InsertNum) {
+    SDValue ExtV = V1;
+    if (InsArray[InsertNum] > V1EltNum) {
+      ExtV = V2;
+      InsArray[InsertNum] -= V1EltNum;
+    }
+    EVT EltVT = MVT::i32;
+    if (EltSize == 64)
+      EltVT = MVT::i64;
+    PassN = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+                        DAG.getConstant(InsArray[InsertNum], MVT::i64));
+    PassN = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, PassN,
+                        DAG.getConstant(InsIndex[InsertNum], MVT::i64));
+  }
+  return PassN;
   }
 
   return SDValue();
Index: lib/Target/AArch64/AArch64InstrNEON.td
===================================================================
--- lib/Target/AArch64/AArch64InstrNEON.td
+++ lib/Target/AArch64/AArch64InstrNEON.td
@@ -41,14 +41,13 @@
 def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
                       [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>;
 
-def Neon_dupImm : SDNode<"AArch64ISD::NEON_DUPIMM", SDTypeProfile<1, 1,
-                         [SDTCisVec<0>, SDTCisVT<1, i32>]>>;
-
 def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
                                      SDTCisVT<2, i32>]>;
 def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
 def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
 
+def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
+                       [SDTCisVec<0>]>>;
 def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
                            [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;
 
@@ -1480,7 +1479,7 @@
             asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
             [(set (Ty VPRC:$Rd),
                (Ty (OpNode (Ty VPRC:$Rn),
-                  (Ty (Neon_dupImm (i32 imm:$Imm))))))],
+                  (Ty (Neon_vdup (i32 imm:$Imm))))))],
             NoItinerary>;
 
 multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
@@ -1585,7 +1584,7 @@
                      [(set (DestTy VPR128:$Rd),
                         (DestTy (shl
                           (DestTy (ExtOp (SrcTy VPR64:$Rn))),
-                            (DestTy (Neon_dupImm (i32 imm:$Imm))))))],
+                            (DestTy (Neon_vdup (i32 imm:$Imm))))))],
                      NoItinerary>;
 
 class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
@@ -1599,7 +1598,7 @@
                      (DestTy (shl
                        (DestTy (ExtOp
                          (SrcTy (getTop VPR128:$Rn)))),
-                        (DestTy (Neon_dupImm (i32 imm:$Imm))))))],
+                        (DestTy (Neon_vdup (i32 imm:$Imm))))))],
                      NoItinerary>;
 
 multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
@@ -1771,7 +1770,7 @@
             asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
             [(set (Ty VPRC:$Rd),
                (Ty (add (Ty VPRC:$src),
                   (Ty (OpNode (Ty VPRC:$Rn),
-                     (Ty (Neon_dupImm (i32 imm:$Imm))))))))],
+                     (Ty (Neon_vdup (i32 imm:$Imm))))))))],
             NoItinerary> {
   let Constraints = "$src = $Rd";
 }
@@ -2048,48 +2047,48 @@
 def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
                       (v8i16 (srl (v8i16 node:$lhs),
-                        (v8i16 (Neon_dupImm (i32 node:$rhs)))))>;
+                        (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
                       (v4i32 (srl (v4i32 node:$lhs),
-                        (v4i32 (Neon_dupImm (i32 node:$rhs)))))>;
+                        (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
                       (v2i64 (srl (v2i64 node:$lhs),
-                        (v2i64 (Neon_dupImm (i32 node:$rhs)))))>;
+                        (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
                       (v8i16 (sra (v8i16 node:$lhs),
-                        (v8i16 (Neon_dupImm (i32 node:$rhs)))))>;
+                        (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
                       (v4i32 (sra (v4i32 node:$lhs),
-                        (v4i32 (Neon_dupImm (i32 node:$rhs)))))>;
+                        (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
 def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
                       (v2i64 (sra (v2i64 node:$lhs),
-                        (v2i64 (Neon_dupImm (i32 node:$rhs)))))>;
+                        (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
 
 // Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors)
 multiclass Neon_shiftNarrow_patterns<string shr> {
   def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn,
-              imm:$Imm))),
+              (i32 imm:$Imm)))),
             (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>;
   def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn,
-              imm:$Imm))),
+              (i32 imm:$Imm)))),
             (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>;
   def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn,
-              imm:$Imm))),
+              (i32 imm:$Imm)))),
             (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>;
 
   def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
              (v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H")
-                    VPR128:$Rn, imm:$Imm)))))),
-            (SHRNvvi_16B (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                    VPR128:$Rn, (i32 imm:$Imm))))))),
+            (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
                          VPR128:$Rn, imm:$Imm)>;
   def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
              (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S")
-                     VPR128:$Rn, imm:$Imm)))))),
+                     VPR128:$Rn, (i32 imm:$Imm))))))),
             (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
                         VPR128:$Rn, imm:$Imm)>;
   def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
             (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D")
-                    VPR128:$Rn, imm:$Imm)))))),
+                    VPR128:$Rn, (i32 imm:$Imm))))))),
             (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
                         VPR128:$Rn, imm:$Imm)>;
 }
@@ -2486,13 +2485,13 @@
 {
   def _8h : PatFrag<(ops node:$Rn),
               (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn),
-                (v8i16 (Neon_dupImm 8))))))>;
+                (v8i16 (Neon_vdup (i32 8)))))))>;
   def _4s : PatFrag<(ops node:$Rn),
               (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn),
-                (v4i32 (Neon_dupImm 16))))))>;
+                (v4i32 (Neon_vdup (i32 16)))))))>;
   def _2d : PatFrag<(ops node:$Rn),
               (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn),
-                (v2i64 (Neon_dupImm 32))))))>;
+                (v2i64 (Neon_vdup (i32 32)))))))>;
 }
 
 defm NI_get_hi : NeonI_get_high;
@@ -3348,36 +3347,49 @@
 def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 VPR128:$src))), (v8f16 VPR128:$src)>;
 
 def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 VPR128:$src))), (v8f16 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v8f16 VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 VPR128:$src))), (v16i8 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 VPR128:$src))), (v4f32 VPR128:$src)>;
 
 def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 VPR128:$src))), (v8f16 VPR128:$src)>;
 
 def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 VPR128:$src))), (v8f16 VPR128:$src)>;
 
 def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 VPR128:$src))), (v8f16 VPR128:$src)>;
 
 def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>;
 def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 VPR128:$src))), (v8f16 VPR128:$src)>;
 
 // ...and scalar bitcasts...
@@ -3402,6 +3414,7 @@
 
 def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>;
 def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 VPR128:$src))), (f128 VPR128:$src)>;
 def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>;
 def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>;
 def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>;
@@ -3424,6 +3437,7 @@
 
 def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
 def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
 def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
 def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
 def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -4327,6 +4341,46 @@
   // bits 11-13 are unspecified.
 }
 
+multiclass Neon_INS_elt_float_pattern {
+def : Pat<(ResTy (vector_insert
+            (ResTy VPR128:$src),
+            (MidTy (vector_extract
+              (ResTy VPR128:$Rn),
+              (ResImm:$Immn))),
+            (ResImm:$Immd))),
+          (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
+            ResImm:$Immd, ResImm:$Immn)>;
+
+def : Pat <(ResTy (vector_insert
+             (ResTy VPR128:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (INS (ResTy VPR128:$src),
+             (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
+             ResImm:$Imm,
+             (i64 0))>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
+               ResImm:$Imm,
+               (i64 0))),
+             sub_64))>;
+}
+
+defm : Neon_INS_elt_float_pattern;
+defm : Neon_INS_elt_float_pattern;
+
 multiclass Neon_INS_elt_pattern {
@@ -4371,14 +4425,15 @@
                      sub_64))>;
 }
 
-defm INSb_pattern : Neon_INS_elt_pattern;
-defm INSh_pattern : Neon_INS_elt_pattern;
-defm INSs_pattern : Neon_INS_elt_pattern;
-defm INSd_pattern : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+defm : Neon_INS_elt_pattern;
+
 class NeonI_SMOV;
 }
 
-defm SMOVxb_pattern : Neon_SMOVx_pattern;
-defm SMOVxh_pattern : Neon_SMOVx_pattern;
-defm SMOVxs_pattern : Neon_SMOVx_pattern;
+defm : Neon_SMOVx_pattern;
+defm : Neon_SMOVx_pattern;
+defm : Neon_SMOVx_pattern;
 
 class Neon_SMOVw_pattern ;
 
-def SMOVwb_pattern : Neon_SMOVw_pattern;
-def SMOVwh_pattern : Neon_SMOVw_pattern;
-
+def : Neon_SMOVw_pattern;
+def : Neon_SMOVw_pattern;
 
 class NeonI_UMOV;
 
-def UMOVwb_pattern : Neon_UMOV_pattern;
-def UMOVwh_pattern : Neon_UMOV_pattern;
-def UMOVws_pattern : Neon_UMOV_pattern;
+def : Neon_UMOV_pattern;
+def : Neon_UMOV_pattern;
+def : Neon_UMOV_pattern;
 
 def : Pat<(i32 (and
           (i32 (vector_extract
@@ -4600,4 +4654,179 @@
 def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
           (v1f32 FPR32:$Rn)>;
 def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
-          (v1f64 FPR64:$Rn)>;
\ No newline at end of file
+          (v1f64 FPR64:$Rn)>;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+          (FMOVdd $src)>;
+
+class NeonI_DUP_Elt
+  : NeonI_copy {
+  bits<4> Imm;
+}
+
+def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, v16i8, v16i8,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, v8i16, v8i16,
+                             neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, v4i32, v4i32,
+                             neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, v2i64, v2i64,
+                             neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, v8i8, v16i8,
+                             neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, v4i16, v8i16,
+                             neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, v2i32, v4i32,
+                             neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass NeonI_DUP_Elt_pattern {
+def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
+          (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
+
+def : Pat<(ResTy (Neon_vduplane
+            (NaTy VPR64:$Rn), OpNImm:$Imm)),
+          (ResTy (DUPELT
+            (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
+}
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+defm : NeonI_DUP_Elt_pattern;
+
+def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v2f32 (DUPELT2s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v4f32 (DUPELT4s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
+          (v2f64 (DUPELT2d
+            (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
+            (i64 0)))>;
+
+class NeonI_DUP
+  : NeonI_copy;
+
+def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
+  let Inst{16} = 0b1;
+  // bits 17-19 are unspecified.
+}
+
+def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
+  let Inst{17-16} = 0b10;
+  // bits 18-19 are unspecified.
+}
+
+def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
+  let Inst{18-16} = 0b100;
+  // bit 19 is unspecified.
+}
+
+def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
+  let Inst{19-16} = 0b1000;
+}
+
+def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
+  let Inst{16} = 0b1;
+  // bits 17-19 are unspecified.
+}
+
+def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
+  let Inst{17-16} = 0b10;
+  // bits 18-19 are unspecified.
+}
+
+def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
+  let Inst{18-16} = 0b100;
+  // bit 19 is unspecified.
+}
+
+// patterns for CONCAT_VECTORS
+multiclass Concat_Vector_Pattern {
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
+          (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
+          (INSELd
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
+            (i64 1),
+            (i64 0))>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
+          (DUPELT2d
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (i64 0))>;
+}
+
+defm : Concat_Vector_Pattern;
+defm : Concat_Vector_Pattern;
+defm : Concat_Vector_Pattern;
+defm : Concat_Vector_Pattern;
+defm : Concat_Vector_Pattern;
+defm : Concat_Vector_Pattern;
+
+// patterns for EXTRACT_SUBVECTOR
+def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
+          (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
+          (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
+          (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
+          (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
+          (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
+          (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
\ No newline at end of file
Index: lib/Target/AArch64/AArch64RegisterInfo.td
===================================================================
--- lib/Target/AArch64/AArch64RegisterInfo.td
+++ lib/Target/AArch64/AArch64RegisterInfo.td
@@ -150,7 +150,8 @@
                              64, (sequence "D%u", 0, 31)>;
 
 def FPR128 : RegisterClass<"AArch64",
-                           [f128,v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
+                           [f128, v2f64, v2i64, v4f32,
+                            v4i32, v8f16, v8i16, v16i8],
                            128, (sequence "Q%u", 0, 31)>;
 
 def FPR64Lo : RegisterClass<"AArch64",
Index: test/CodeGen/AArch64/neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/neon-copy.ll
+++ test/CodeGen/AArch64/neon-copy.ll
@@ -71,6 +71,62 @@
   ret <2 x i64> %tmp4
 }
 
+define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[15], {{v[0-31]+}}.b[2]
+  %tmp3 = extractelement <8 x i8> %tmp1, i32 2
+  %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
+  ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[7], {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+  %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
+  ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[1]
+  %tmp3 = extractelement <2 x i32> %tmp1, i32 1
+  %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
+  ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <1 x i64> %tmp1, i32 0
+  %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
+  ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.b[7], {{v[0-31]+}}.b[2]
+  %tmp3 = extractelement <16 x i8> %tmp1, i32 2
+  %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
+  ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.h[3], {{v[0-31]+}}.h[2]
+  %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+  %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
+  ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+  %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
+  ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[0], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+  %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
+  ret <1 x i64> %tmp4
+}
+
 define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
 ;CHECK: ins {{v[0-31]+}}.b[4], {{v[0-31]+}}.b[2]
   %tmp3 = extractelement <8 x i8> %tmp1, i32 2
@@ -99,6 +155,32 @@
   ret <1 x i64> %tmp4
 }
 
+define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[1], {{v[0-31]+}}.s[2]
+  %tmp3 = extractelement <4 x float> %tmp1, i32 2
+  %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
+  ret <4 x float> %tmp4
+}
+
+define <2 x double> @ins2f2(<2 x double> %tmp1, <2 x double> %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+  %tmp3 = extractelement <2 x double> %tmp1, i32 0
+  %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
+  ret <2 x double> %tmp4
+}
+
+define <4 x float> @insf(<4 x float> %tmp1, float %tmp2) {
+;CHECK: ins {{v[0-31]+}}.s[3], {{v[0-31]+}}.s[0]
+  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 3
+  ret <4 x float> %tmp3
+}
+
+define <2 x double> @insd(<2 x double> %tmp1, double %tmp2) {
+;CHECK: ins {{v[0-31]+}}.d[1], {{v[0-31]+}}.d[0]
+  %tmp3 = insertelement <2 x double> %tmp1, double %tmp2, i32 1
+  ret <2 x double> %tmp3
+}
+
 define i32 @umovw16b(<16 x i8> %tmp1) {
 ;CHECK: umov {{w[0-31]+}}, {{v[0-31]+}}.b[8]
   %tmp3 = extractelement <16 x i8> %tmp1, i32 8
@@ -225,8 +307,196 @@
   ret i64 %tmp4
 }
 
+define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins v0.b[5], v1.b[3]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins v0.b[14], v1.b[6]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins v1.b[7], v0.b[0]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins v1.b[0], v0.b[15]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
+;CHECK: dup v0.8b, w0
+  %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7
+  ret <8 x i8> %vecinit7.i
+}
+
+define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
+;CHECK: dup v0.4h, w0
+  %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3
+  ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
+;CHECK: dup v0.2s, w0
+  %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
+  ret <2 x i32> %vecinit1.i
+}
+
+define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
+;CHECK: fmov d0, x0
+  %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
+  ret <1 x i64> %vecinit.i
+}
+
+define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
+;CHECK: dup v0.16b, w0
+  %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7
+  %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8
+  %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9
+  %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10
+  %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11
+  %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12
+  %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13
+  %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14
+  %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15
+  ret <16 x i8> %vecinit15.i
+}
+
+define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
+;CHECK: dup v0.8h, w0
+  %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7
+  ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
+;CHECK: dup v0.4s, w0
+  %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3
+  ret <4 x i32> %vecinit3.i
+}
+define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
+;CHECK: dup v0.2d, x0
+  %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
+  ret <2 x i64> %vecinit1.i
+}
+define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup v0.8b, v0.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup v0.4h, v0.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup v0.2s, v0.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup v0.16b, v0.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup v0.8h, v0.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup v0.4s, v0.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
+;CHECK: dup v0.2d, v0.d[0]
+  %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
+
+define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup v0.8b, v0.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup v0.4h, v0.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup v0.2s, v0.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup v0.16b, v0.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup v0.8h, v0.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup v0.4s, v0.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
+;CHECK: dup v0.2d, v0.d[0]
+  %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
Index: test/MC/AArch64/neon-diagnostics.s
===================================================================
--- test/MC/AArch64/neon-diagnostics.s
+++ test/MC/AArch64/neon-diagnostics.s
@@ -3839,3 +3839,187 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:        frsqrts d8, s22, d18
 // CHECK-ERROR:        ^
+
+        ins v2.b[16], w1
+        ins v7.h[8], w14
+        ins v20.s[5], w30
+        ins v1.d[2], x7
+        ins v2.b[3], b1
+        ins v7.h[2], h14
+        ins v20.s[1], s30
+        ins v1.d[0], d7
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        ins v2.b[16], w1
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        ins v7.h[8], w14
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        ins v20.s[5], w30
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        ins v1.d[2], x7
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ins v2.b[3], b1
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ins v7.h[2], h14
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ins v20.s[1], s30
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ins v1.d[0], d7
+// CHECK-ERROR:        ^
+
+        smov w1, v0.b[16]
+        smov w14, v6.h[8]
+        smov x1, v0.b[16]
+        smov x14, v6.h[8]
+        smov x20, v9.s[5]
+        smov w1, v0.d[0]
+        smov w14, v6.d[1]
+        smov x1, v0.d[0]
+        smov x14, v6.d[1]
+        smov x20, v9.d[0]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smov w1, v0.b[16]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smov w14, v6.h[8]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smov x1, v0.b[16]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smov x14, v6.h[8]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        smov x20, v9.s[5]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smov w1, v0.d[0]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smov w14, v6.d[1]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smov x1, v0.d[0]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smov x14, v6.d[1]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smov x20, v9.d[0]
+// CHECK-ERROR:        ^
+
+        umov w1, v0.b[16]
+        umov w14, v6.h[8]
+        umov w20, v9.s[5]
+        umov x7, v18.d[3]
+        umov w1, v0.d[0]
+        umov s20, v9.s[2]
+        umov d7, v18.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umov w1, v0.b[16]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umov w14, v6.h[8]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umov w20, v9.s[5]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:        umov x7, v18.d[3]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umov w1, v0.d[0]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umov s20, v9.s[2]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umov d7, v18.d[1]
+// CHECK-ERROR:        ^
+
+        Ins v1.h[2], v3.b[6]
+        Ins v6.h[7], v7.s[2]
+        Ins v15.d[0], v22.s[2]
+        Ins v0.d[0], v4.b[1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        Ins v1.h[2], v3.b[6]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        Ins v6.h[7], v7.s[2]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        Ins v15.d[0], v22.s[2]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        Ins v0.d[0], v4.b[1]
+// CHECK-ERROR:        ^
+
+        dup v1.8h, v2.b[2]
+        dup v11.4s, v7.h[7]
+        dup v17.2d, v20.s[0]
+        dup v1.16b, v2.h[2]
+        dup v11.8h, v7.s[3]
+        dup v17.4s, v20.d[0]
+        dup v5.2d, v1.b[1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v1.8h, v2.b[2]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v11.4s, v7.h[7]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v17.2d, v20.s[0]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v1.16b, v2.h[2]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR:        dup v11.8h, v7.s[3]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR:        dup v17.4s, v20.d[0]
+// CHECK-ERROR:        ^
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR:        dup v5.2d, v1.b[1]
+// CHECK-ERROR:        ^
+
+        dup v1.8b, b1
+        dup v11.4h, h14
+        dup v17.2s, s30
+        dup v1.16b, d2
+        dup v11.8s, w16
+        dup v17.4d, w28
+        dup v5.2d, w0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v1.8b, b1
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v11.4h, h14
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v17.2s, s30
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v1.16b, d2
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v11.8s, w16
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v17.4d, w28
+// CHECK-ERROR:        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        dup v5.2d, w0
+// CHECK-ERROR:        ^
Index: test/MC/AArch64/neon-simd-copy.s
===================================================================
--- test/MC/AArch64/neon-simd-copy.s
+++ test/MC/AArch64/neon-simd-copy.s
@@ -60,6 +60,44 @@
 // CHECK: ins v15.s[3], v22.s[2]     // encoding: [0xcf,0x5e,0x1c,0x6e]
 // CHECK: ins v0.d[0], v4.d[1]       // encoding: [0x80,0x44,0x08,0x6e]
 
+//------------------------------------------------------------------------------
+// Duplicate to all lanes (vector, from element)
+//------------------------------------------------------------------------------
+        dup v1.8b, v2.b[2]
+        dup v11.4h, v7.h[7]
+        dup v17.2s, v20.s[0]
+        dup v1.16b, v2.b[2]
+        dup v11.8h, v7.h[7]
+        dup v17.4s, v20.s[0]
+        dup v5.2d, v1.d[1]
+
+// CHECK: dup v1.8b, v2.b[2]        // encoding: [0x41,0x04,0x05,0x0e]
+// CHECK: dup v11.4h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x0e]
+// CHECK: dup v17.2s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x0e]
+// CHECK: dup v1.16b, v2.b[2]       // encoding: [0x41,0x04,0x05,0x4e]
+// CHECK: dup v11.8h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x4e]
+// CHECK: dup v17.4s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x4e]
+// CHECK: dup v5.2d, v1.d[1]        // encoding: [0x25,0x04,0x18,0x4e]
+
+//------------------------------------------------------------------------------
+// Duplicate to all lanes (vector, from main)
+//------------------------------------------------------------------------------
+        dup v1.8b, w1
+        dup v11.4h, w14
+        dup v17.2s, w30
+        dup v1.16b, w2
+        dup v11.8h, w16
+        dup v17.4s, w28
+        dup v5.2d, x0
+
+// CHECK: dup v1.8b, w1             // encoding: [0x21,0x0c,0x01,0x0e]
+// CHECK: dup v11.4h, w14           // encoding: [0xcb,0x0d,0x0a,0x0e]
+// CHECK: dup v17.2s, w30           // encoding: [0xd1,0x0f,0x14,0x0e]
+// CHECK: dup v1.16b, w2            // encoding: [0x41,0x0c,0x01,0x4e]
+// CHECK: dup v11.8h, w16           // encoding: [0x0b,0x0e,0x0a,0x4e]
+// CHECK: dup v17.4s, w28           // encoding: [0x91,0x0f,0x14,0x4e]
+// CHECK: dup v5.2d, x0             // encoding: [0x05,0x0c,0x08,0x4e]
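
Editor's note: as a quick illustration of the BUILD_VECTOR lowering added above (a sketch, not part of the patch; the function name is hypothetical), a non-constant splat built up with insertelement is expected to select to a single dup from a general-purpose register, matching the dup-from-GPR patterns and the test_vdupq_n_u32 test:

define <4 x i32> @illustrate_vdup_splat(i32 %v) {
; expected codegen: dup v0.4s, w0
  %a = insertelement <4 x i32> undef, i32 %v, i32 0
  %b = insertelement <4 x i32> %a, i32 %v, i32 1
  %c = insertelement <4 x i32> %b, i32 %v, i32 2
  %d = insertelement <4 x i32> %c, i32 %v, i32 3
  ret <4 x i32> %d
}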
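
Likewise, a sketch of the insert-by-element path added to LowerVECTOR_SHUFFLE (again illustrative only, hypothetical function name): a two-operand shuffle whose mask is the identity except for one lane — the "0, 1, 2, 3, 4, 5, 13, 7" case from the comment — should be selected as a single INS (element) instead of a generic expansion:

define <8 x i16> @illustrate_ins_lane(<8 x i16> %a, <8 x i16> %b) {
; expected codegen: ins v0.h[6], v1.h[5]
  %s = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 13, i32 7>
  ret <8 x i16> %s
}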