Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -134,13 +134,13 @@
   // Vector compare bitwise test
   NEON_TST,
 
-  // Operation for the immediate in vector shift
-  NEON_DUPIMM,
-
   // Vector saturating shift
   NEON_QSHLs,
   NEON_QSHLu,
 
+  // Vector dup
+  NEON_VDUP,
+
   // Vector dup by lane
   NEON_VDUPLANE
 };
@@ -292,6 +292,10 @@
   Neon_Mov_Imm,
   Neon_Mvn_Imm
 };
+
+extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
+                                bool &usesOnlyOneValue, bool &hasDominantValue,
+                                bool &isConstant, bool &isUNDEF);
 } // namespace llvm
 
 #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -297,15 +297,23 @@
   setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
   setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
 
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
 
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
@@ -866,12 +874,12 @@
     return "AArch64ISD::NEON_CMPZ";
   case AArch64ISD::NEON_TST:
     return "AArch64ISD::NEON_TST";
-  case AArch64ISD::NEON_DUPIMM:
-    return "AArch64ISD::NEON_DUPIMM";
   case AArch64ISD::NEON_QSHLs:
     return "AArch64ISD::NEON_QSHLs";
   case AArch64ISD::NEON_QSHLu:
     return "AArch64ISD::NEON_QSHLu";
+  case AArch64ISD::NEON_VDUP:
+    return "AArch64ISD::NEON_VDUP";
   case AArch64ISD::NEON_VDUPLANE:
     return "AArch64ISD::NEON_VDUPLANE";
   default:
@@ -3342,7 +3350,7 @@
   case ISD::SHL:
     if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
       SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                       DAG.getConstant(Cnt, MVT::i32));
       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
     }
@@ -3352,7 +3360,7 @@
   case ISD::SRL:
     if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
       SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                       DAG.getConstant(Cnt, MVT::i32));
       return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
     }
@@ -3492,6 +3500,107 @@
       }
     }
   }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
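+
+  // Classify the build_vector in a single pass over its operands: is only
+  // lane 0 defined, is one value used for every lane, does some value occupy
+  // more than half of the lanes (dominant), and are all elements constants?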
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt, so keep going if we
+  // hit this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  // Use VDUP for non-constant splats.
+  if (hasDominantValue && EltSize <= 64) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are DUPing a value that comes directly from a vector, we could
+      // just use DUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the DUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+        N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
+    if (usesOnlyOneValue && isConstant) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+    }
+  }
+  // If all elements are constants and the case above didn't get hit, fall
+  // back to the default expansion, which will generate a load from the
+  // constant pool.
+  if (isConstant)
+    return SDValue();
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
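+  // Otherwise, with mixed non-constant values, build the vector by inserting
+  // each defined lane into an UNDEF vector.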
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
 
   return SDValue();
 }
@@ -3499,6 +3608,7 @@
 AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
@@ -3516,9 +3626,89 @@
       // If this is undef splat, generate it via "just" vdup, if possible.
       if (Lane == -1) Lane = 0;
 
+      // Test if V1 is a SCALAR_TO_VECTOR.
+      if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
+      }
+      // Test if V1 is a BUILD_VECTOR which is equivalent to a
+      // SCALAR_TO_VECTOR.
+      if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        bool IsScalarToVector = true;
+        for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
+          if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
+              i != (unsigned)Lane) {
+            IsScalarToVector = false;
+            break;
+          }
+        if (IsScalarToVector)
+          return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
+                             V1.getOperand(Lane));
+      }
       return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
                          DAG.getConstant(Lane, MVT::i64));
     }
+
+    // For a shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate an
+    // insert by element from V2 into V1.
+    // If the shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 is the
+    // better insert target, as it needs fewer inserts than V1; so we count
+    // the elements that need to be inserted for both V1 and V2 and select
+    // whichever needs fewer as the insert target.
+
+    // Collect the elements that need to be inserted and their indices.
+    SmallVector<int, 8> NV1Elt;
+    SmallVector<int, 8> N1Index;
+    SmallVector<int, 8> NV2Elt;
+    SmallVector<int, 8> N2Index;
+    int Length = ShuffleMask.size();
+    int V1EltNum = V1.getValueType().getVectorNumElements();
+    for (int I = 0; I != Length; ++I) {
+      if (ShuffleMask[I] != I) {
+        NV1Elt.push_back(ShuffleMask[I]);
+        N1Index.push_back(I);
+      }
+    }
+    for (int I = 0; I != Length; ++I) {
+      if (ShuffleMask[I] != (I + V1EltNum)) {
+        NV2Elt.push_back(ShuffleMask[I]);
+        N2Index.push_back(I);
+      }
+    }
+
+    // Decide which vector to insert into. If all lanes mismatch, neither V1
+    // nor V2 serves as the base, so we insert into an UNDEF vector instead.
+    SDValue InsV = V1;
+    SmallVector<int, 8> InsMasks = NV1Elt;
+    SmallVector<int, 8> InsIndex = N1Index;
+    if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
+      if (NV1Elt.size() > NV2Elt.size()) {
+        InsV = V2;
+        InsMasks = NV2Elt;
+        InsIndex = N2Index;
+      }
+    } else {
+      InsV = DAG.getNode(ISD::UNDEF, dl, VT);
+    }
+
+    SDValue PassN;
+
+    for (int I = 0, E = InsMasks.size(); I != E; ++I) {
+      SDValue ExtV = V1;
+      int Mask = InsMasks[I];
+      if (Mask >= V1EltNum) {
+        ExtV = V2;
+        Mask -= V1EltNum;
+      }
+      // Any value type smaller than i32 is illegal in AArch64, and this
+      // lowering function is called after the legalize pass, so we need to
+      // legalize the result here.
      EVT EltVT = MVT::i32;
+      if (EltSize == 64)
+        EltVT = MVT::i64;
+      PassN = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+                          DAG.getConstant(Mask, MVT::i64));
+      PassN = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, PassN,
+                          DAG.getConstant(InsIndex[I], MVT::i64));
+    }
+    return PassN;
   }
 
   return SDValue();
Index: lib/Target/AArch64/AArch64InstrNEON.td
===================================================================
--- lib/Target/AArch64/AArch64InstrNEON.td
+++ lib/Target/AArch64/AArch64InstrNEON.td
@@ -41,14 +41,13 @@
 def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
                       [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>;
 
-def Neon_dupImm : SDNode<"AArch64ISD::NEON_DUPIMM", SDTypeProfile<1, 1,
-                         [SDTCisVec<0>, SDTCisVT<1, i32>]>>;
-
 def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
                                      SDTCisVT<2, i32>]>;
 def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
 def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
 
+def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
+                       [SDTCisVec<0>]>>;
 def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
                            [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;
 
@@ -1480,7 +1479,7 @@
            asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
            [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn),
-             (Ty (Neon_dupImm (i32 imm:$Imm))))))],
+             (Ty (Neon_vdup (i32 imm:$Imm))))))],
            NoItinerary>;
 
 multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
@@ -1585,7 +1584,7 @@
            [(set (DestTy VPR128:$Rd),
              (DestTy (shl
                (DestTy (ExtOp (SrcTy VPR64:$Rn))),
-               (DestTy (Neon_dupImm (i32 imm:$Imm))))))],
+               (DestTy (Neon_vdup (i32 imm:$Imm))))))],
            NoItinerary>;
 
 class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
@@ -1599,7 +1598,7 @@
            [(set (DestTy VPR128:$Rd),
              (DestTy (shl
                (DestTy (ExtOp
                  (SrcTy (getTop VPR128:$Rn)))),
-               (DestTy (Neon_dupImm (i32 imm:$Imm))))))],
+               (DestTy (Neon_vdup (i32 imm:$Imm))))))],
            NoItinerary>;
 
 multiclass NeonI_N2VShLL<bit u, bits<5> opcode, string asmop,
@@ -1771,7 +1770,7 @@
# T # ", $Imm", [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src), (Ty (OpNode (Ty VPRC:$Rn), - (Ty (Neon_dupImm (i32 imm:$Imm))))))))], + (Ty (Neon_vdup (i32 imm:$Imm))))))))], NoItinerary> { let Constraints = "$src = $Rd"; } @@ -2048,48 +2047,48 @@ def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs), (v8i16 (srl (v8i16 node:$lhs), - (v8i16 (Neon_dupImm (i32 node:$rhs)))))>; + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs), (v4i32 (srl (v4i32 node:$lhs), - (v4i32 (Neon_dupImm (i32 node:$rhs)))))>; + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs), (v2i64 (srl (v2i64 node:$lhs), - (v2i64 (Neon_dupImm (i32 node:$rhs)))))>; + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs), (v8i16 (sra (v8i16 node:$lhs), - (v8i16 (Neon_dupImm (i32 node:$rhs)))))>; + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs), (v4i32 (sra (v4i32 node:$lhs), - (v4i32 (Neon_dupImm (i32 node:$rhs)))))>; + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs), (v2i64 (sra (v2i64 node:$lhs), - (v2i64 (Neon_dupImm (i32 node:$rhs)))))>; + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; // Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors) multiclass Neon_shiftNarrow_patterns { def : Pat<(v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") VPR128:$Rn, - imm:$Imm))), + (i32 imm:$Imm)))), (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>; def : Pat<(v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") VPR128:$Rn, - imm:$Imm))), + (i32 imm:$Imm)))), (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>; def : Pat<(v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") VPR128:$Rn, - imm:$Imm))), + (i32 imm:$Imm)))), (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") - VPR128:$Rn, imm:$Imm)))))), - (SHRNvvi_16B (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, (i32 imm:$Imm))))))), + (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") - VPR128:$Rn, imm:$Imm)))))), + VPR128:$Rn, (i32 imm:$Imm))))))), (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") - VPR128:$Rn, imm:$Imm)))))), + VPR128:$Rn, (i32 imm:$Imm))))))), (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; } @@ -2486,13 +2485,13 @@ { def _8h : PatFrag<(ops node:$Rn), (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn), - (v8i16 (Neon_dupImm 8))))))>; + (v8i16 (Neon_vdup (i32 8)))))))>; def _4s : PatFrag<(ops node:$Rn), (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn), - (v4i32 (Neon_dupImm 16))))))>; + (v4i32 (Neon_vdup (i32 16)))))))>; def _2d : PatFrag<(ops node:$Rn), (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn), - (v2i64 (Neon_dupImm 32))))))>; + (v2i64 (Neon_vdup (i32 32)))))))>; } defm NI_get_hi : NeonI_get_high; @@ -4327,6 +4326,46 @@ // bits 11-13 are unspecified. 
+multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy,
+                                      ValueType MidTy, RegisterClass OpFPR,
+                                      Operand ResImm, SubRegIndex SubIndex,
+                                      Instruction INS> {
+def : Pat<(ResTy (vector_insert
+            (ResTy VPR128:$src),
+            (MidTy (vector_extract
+              (ResTy VPR128:$Rn),
+              (ResImm:$Immn))),
+            (ResImm:$Immd))),
+          (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
+            ResImm:$Immd, ResImm:$Immn)>;
+
+def : Pat <(ResTy (vector_insert
+             (ResTy VPR128:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (INS (ResTy VPR128:$src),
+             (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
+             ResImm:$Imm,
+             (i64 0))>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
+               ResImm:$Imm,
+               (i64 0))),
+             sub_64))>;
+}
+
+defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare,
+                                  sub_32, INSELs>;
+defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare,
+                                  sub_64, INSELd>;
 
 multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy,
                                 ValueType MidTy, RegisterClass OpGPR,
                                 Operand OpImm, Instruction INS> {
@@ -4371,14 +4410,15 @@
             sub_64))>;
 }
 
-defm INSb_pattern : Neon_INS_elt_pattern<v16i8, v8i8, i32, GPR32,
-                                         neon_uimm4_bare, INSbw>;
-defm INSh_pattern : Neon_INS_elt_pattern<v8i16, v4i16, i32, GPR32,
-                                         neon_uimm3_bare, INShw>;
-defm INSs_pattern : Neon_INS_elt_pattern<v4i32, v2i32, i32, GPR32,
-                                         neon_uimm2_bare, INSsw>;
-defm INSd_pattern : Neon_INS_elt_pattern<v2i64, v1i64, i64, GPR64,
-                                         neon_uimm1_bare, INSdx>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, GPR32, neon_uimm4_bare, INSbw>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, GPR32, neon_uimm3_bare, INShw>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, GPR32, neon_uimm2_bare, INSsw>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, GPR64, neon_uimm1_bare, INSdx>;
+
 class NeonI_SMOV<string asmop, string Res, bit Q, ValueType OpTy,
                  ValueType eleTy, Operand OpImm,
                  RegisterClass ResGPR, ValueType ResTy>
   : NeonI_copy<Q, 0b0, 0b0101, (outs ResGPR:$Rd),
                (ins VPR128:$Rn, OpImm:$Imm),
                asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
                [], NoItinerary> {
   bits<4> Imm;
 }
 
-defm SMOVxb_pattern : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
-                                         neon_uimm3_bare, SMOVxb>;
-defm SMOVxh_pattern : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
-                                         neon_uimm2_bare, SMOVxh>;
-defm SMOVxs_pattern : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
-                                         neon_uimm1_bare, SMOVxs>;
+defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                          neon_uimm3_bare, SMOVxb>;
+defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                          neon_uimm2_bare, SMOVxh>;
+defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                          neon_uimm1_bare, SMOVxs>;
 
 class Neon_SMOVw_pattern<ValueType StTy, ValueType NaTy, ValueType eleTy,
                          Operand StImm, Operand NaImm, Instruction SMOVI>
   : Pat<(i32 (sext_inreg
           (i32 (vector_extract (StTy VPR128:$Rn), (StImm:$Imm))),
           eleTy)),
         (SMOVI VPR128:$Rn, StImm:$Imm)>;
 
-def SMOVwb_pattern : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
-                                        neon_uimm3_bare, SMOVwb>;
-def SMOVwh_pattern : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
-                                        neon_uimm2_bare, SMOVwh>;
-
+def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                         neon_uimm3_bare, SMOVwb>;
+def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                         neon_uimm2_bare, SMOVwh>;
 
 class NeonI_UMOV<string asmop, string Res, bit Q, ValueType OpTy,
                  Operand OpImm, RegisterClass ResGPR, ValueType ResTy>
   : NeonI_copy<Q, 0b0, 0b0111, (outs ResGPR:$Rd),
                (ins VPR128:$Rn, OpImm:$Imm),
                asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
                [], NoItinerary> {
   bits<4> Imm;
 }
 
-def UMOVwb_pattern : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
-                                       neon_uimm3_bare, UMOVwb>;
-def UMOVwh_pattern : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
-                                       neon_uimm2_bare, UMOVwh>;
-def UMOVws_pattern : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
-                                       neon_uimm1_bare, UMOVws>;
+def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+                        neon_uimm3_bare, UMOVwb>;
+def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+                        neon_uimm2_bare, UMOVwh>;
+def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                        neon_uimm1_bare, UMOVws>;
 
 def : Pat<(i32 (and (i32 (vector_extract
@@ -4600,4 +4639,179 @@
 def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
           (v1f32 FPR32:$Rn)>;
 def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
-          (v1f64 FPR64:$Rn)>;
\ No newline at end of file
+          (v1f64 FPR64:$Rn)>;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+          (FMOVdd $src)>;
+
+class NeonI_DUP_Elt<bit Q, string asmop, string rdlane, string rnlane,
+                    RegisterOperand ResVPR, ValueType ResTy, ValueType OpTy,
+                    Operand OpImm>
+  : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd),
+               (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
+               [],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, v16i8, v16i8,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, v8i16, v8i16,
+                             neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, v4i32, v4i32,
+                             neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, v2i64, v2i64,
+                             neon_uimm1_bare> {
+  let Inst{20-16} = {Imm{0}, 0b1, 0b0, 0b0, 0b0};
+}
+
+def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, v8i8, v16i8,
+                             neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, v4i16, v8i16,
+                             neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, v2i32, v4i32,
+                             neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy,
+                                 ValueType OpTy, ValueType ExTy,
+                                 ValueType NaTy, Operand OpLImm,
+                                 Operand OpNImm> {
+def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
+          (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
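+
+// Widen the 64-bit operand with SUBREG_TO_REG so the same 128-bit DUPELT
+// instruction also covers a vduplane taken from a 64-bit vector.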
+def : Pat<(ResTy (Neon_vduplane
+            (NaTy VPR64:$Rn), OpNImm:$Imm)),
+          (ResTy (DUPELT
+            (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
+}
+
+defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v16i8, v8i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v8i16, v4i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v4i32, v2i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v2i64, v1i64,
+                             neon_uimm1_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v16i8, v8i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v8i16, v4i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v4i32, v2i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v4f32, v2f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v4f32, v2f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v2f64, v1f64,
+                             neon_uimm1_bare, neon_uimm1_bare>;
+
+def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v2f32 (DUPELT2s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v4f32 (DUPELT4s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
+          (v2f64 (DUPELT2d
+            (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
+            (i64 0)))>;
+
+class NeonI_DUP<bit Q, string asmop, string rdlane, RegisterOperand ResVPR,
+                ValueType ResTy, RegisterClass OpGPR, ValueType OpTy>
+  : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn),
+               asmop # "\t$Rd" # rdlane # ", $Rn",
+               [(set (ResTy ResVPR:$Rd),
+                 (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
+               NoItinerary>;
+
+def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
+  let Inst{16} = 0b1;
+  // bits 17-19 are unspecified.
+}
+
+def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
+  let Inst{17-16} = 0b10;
+  // bits 18-19 are unspecified.
+}
+
+def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
+  let Inst{18-16} = 0b100;
+  // bit 19 is unspecified.
+}
+
+def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
+  let Inst{19-16} = 0b1000;
+}
+
+def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
+  let Inst{16} = 0b1;
+  // bits 17-19 are unspecified.
+}
+
+def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
+  let Inst{17-16} = 0b10;
+  // bits 18-19 are unspecified.
+}
+
+def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
+  let Inst{18-16} = 0b100;
+  // bit 19 is unspecified.
+}
+
+// Patterns for CONCAT_VECTORS
+multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> {
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
+          (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
+          (INSELd
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
+            (i64 1),
+            (i64 0))>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
+          (DUPELT2d
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (i64 0))>;
+}
+
+defm : Concat_Vector_Pattern<v16i8, v8i8>;
+defm : Concat_Vector_Pattern<v8i16, v4i16>;
+defm : Concat_Vector_Pattern<v4i32, v2i32>;
+defm : Concat_Vector_Pattern<v2i64, v1i64>;
+defm : Concat_Vector_Pattern<v4f32, v2f32>;
+defm : Concat_Vector_Pattern<v2f64, v1f64>;
+
+// Patterns for EXTRACT_SUBVECTOR
+def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
+          (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
+          (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
+          (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
+          (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
+          (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
+          (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
\ No newline at end of file
Index: test/CodeGen/AArch64/neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/neon-copy.ll
+++ test/CodeGen/AArch64/neon-copy.ll
@@ -225,8 +225,196 @@
   ret i64 %tmp4
 }
 
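+; Lane-to-lane copy shuffles should select INS; splat shuffles and splat
+; build_vectors should select DUP (or FMOV for the single-element v1i64 case).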
+define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7
+  ret <8 x i8> %vecinit7.i
+}
+
+define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3
+  ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+  %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
+  ret <2 x i32> %vecinit1.i
+}
+
+define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
+;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+  %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
+  ret <1 x i64> %vecinit.i
+}
+
+define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+  %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7
+  %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8
+  %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9
+  %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10
+  %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11
+  %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12
+  %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13
+  %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14
+  %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15
+  ret <16 x i8> %vecinit15.i
+}
+
+define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7
+  ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3
+  ret <4 x i32> %vecinit3.i
+}
+
+define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+  %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
+  ret <2 x i64> %vecinit1.i
+}
+
+define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
+
+define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
Index: test/MC/AArch64/neon-diagnostics.s
===================================================================
--- test/MC/AArch64/neon-diagnostics.s
+++ test/MC/AArch64/neon-diagnostics.s
@@ -3839,3 +3839,187 @@
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: frsqrts d8, s22, d18
// CHECK-ERROR: ^
+
+        ins v2.b[16], w1
+        ins v7.h[8], w14
+        ins v20.s[5], w30
+        ins v1.d[2], x7
+        ins v2.b[3], b1
+        ins v7.h[2], h14
+        ins v20.s[1], s30
+        ins v1.d[0], d7
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ins v2.b[16], w1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ins v7.h[8], w14
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ins v20.s[5], w30
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ins v1.d[2], x7
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ins v2.b[3], b1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ins v7.h[2], h14
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ins v20.s[1], s30
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ins v1.d[0], d7
+// CHECK-ERROR: ^
+
+        smov w1, v0.b[16]
+        smov w14, v6.h[8]
+        smov x1, v0.b[16]
+        smov x14, v6.h[8]
+        smov x20, v9.s[5]
+        smov w1, v0.d[0]
+        smov w14, v6.d[1]
+        smov x1, v0.d[0]
+        smov x14, v6.d[1]
+        smov x20, v9.d[0]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smov w1, v0.b[16]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smov w14, v6.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smov x1, v0.b[16]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smov x14, v6.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smov x20, v9.s[5]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smov w1, v0.d[0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smov w14, v6.d[1]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smov x1, v0.d[0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smov x14, v6.d[1]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smov x20, v9.d[0]
+// CHECK-ERROR: ^
+
+        umov w1, v0.b[16]
+        umov w14, v6.h[8]
+        umov w20, v9.s[5]
+        umov x7, v18.d[3]
+        umov w1, v0.d[0]
+        umov s20, v9.s[2]
+        umov d7, v18.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umov w1, v0.b[16]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umov w14, v6.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umov w20, v9.s[5]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umov x7, v18.d[3]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umov w1, v0.d[0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umov s20, v9.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umov d7, v18.d[1]
+// CHECK-ERROR: ^
+
+        Ins v1.h[2], v3.b[6]
+        Ins v6.h[7], v7.s[2]
+        Ins v15.d[0], v22.s[2]
+        Ins v0.d[0], v4.b[1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: Ins v1.h[2], v3.b[6]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: Ins v6.h[7], v7.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: Ins v15.d[0], v22.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: Ins v0.d[0], v4.b[1]
+// CHECK-ERROR: ^
+
+        dup v1.8h, v2.b[2]
+        dup v11.4s, v7.h[7]
+        dup v17.2d, v20.s[0]
+        dup v1.16b, v2.h[2]
+        dup v11.8h, v7.s[3]
+        dup v17.4s, v20.d[0]
+        dup v5.2d, v1.b[1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v1.8h, v2.b[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v11.4s, v7.h[7]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v17.2d, v20.s[0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v1.16b, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR: dup v11.8h, v7.s[3]
+// CHECK-ERROR: ^
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR: dup v17.4s, v20.d[0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: invalid operand for instruction
+// CHECK-ERROR: dup v5.2d, v1.b[1]
+// CHECK-ERROR: ^
+
+        dup v1.8b, b1
+        dup v11.4h, h14
+        dup v17.2s, s30
+        dup v1.16b, d2
+        dup v11.8s, w16
+        dup v17.4d, w28
+        dup v5.2d, w0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v1.8b, b1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v11.4h, h14
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v17.2s, s30
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v1.16b, d2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v11.8s, w16
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v17.4d, w28
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: dup v5.2d, w0
+// CHECK-ERROR: ^
Index: test/MC/AArch64/neon-simd-copy.s
===================================================================
--- test/MC/AArch64/neon-simd-copy.s
+++ test/MC/AArch64/neon-simd-copy.s
@@ -60,6 +60,44 @@
 
 // CHECK: ins v15.s[3], v22.s[2]    // encoding: [0xcf,0x5e,0x1c,0x6e]
 // CHECK: ins v0.d[0], v4.d[1]      // encoding: [0x80,0x44,0x08,0x6e]
 
+//------------------------------------------------------------------------------
+// Duplicate to all lanes (vector, from element)
+//------------------------------------------------------------------------------
+        dup v1.8b, v2.b[2]
+        dup v11.4h, v7.h[7]
+        dup v17.2s, v20.s[0]
+        dup v1.16b, v2.b[2]
+        dup v11.8h, v7.h[7]
+        dup v17.4s, v20.s[0]
+        dup v5.2d, v1.d[1]
+
+// CHECK: dup v1.8b, v2.b[2]        // encoding: [0x41,0x04,0x05,0x0e]
+// CHECK: dup v11.4h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x0e]
+// CHECK: dup v17.2s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x0e]
+// CHECK: dup v1.16b, v2.b[2]       // encoding: [0x41,0x04,0x05,0x4e]
+// CHECK: dup v11.8h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x4e]
+// CHECK: dup v17.4s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x4e]
+// CHECK: dup v5.2d, v1.d[1]        // encoding: [0x25,0x04,0x18,0x4e]
+
+//------------------------------------------------------------------------------
+// Duplicate to all lanes (vector, from main)
+//------------------------------------------------------------------------------
+        dup v1.8b, w1
+        dup v11.4h, w14
+        dup v17.2s, w30
+        dup v1.16b, w2
+        dup v11.8h, w16
+        dup v17.4s, w28
+        dup v5.2d, x0
+
+// CHECK: dup v1.8b, w1             // encoding: [0x21,0x0c,0x01,0x0e]
+// CHECK: dup v11.4h, w14           // encoding: [0xcb,0x0d,0x0a,0x0e]
+// CHECK: dup v17.2s, w30           // encoding: [0xd1,0x0f,0x14,0x0e]
+// CHECK: dup v1.16b, w2            // encoding: [0x41,0x0c,0x01,0x4e]
+// CHECK: dup v11.8h, w16           // encoding: [0x0b,0x0e,0x0a,0x4e]
+// CHECK: dup v17.4s, w28           // encoding: [0x91,0x0f,0x14,0x4e]
+// CHECK: dup v5.2d, x0             // encoding: [0x05,0x0c,0x08,0x4e]
+