Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -139,7 +139,10 @@ // Vector saturating shift NEON_QSHLs, - NEON_QSHLu + NEON_QSHLu, + + // Vector dup by lane + NEON_VDUPLANE }; } @@ -179,6 +182,8 @@ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, SDValue &Chain) const; Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -297,7 +297,20 @@ setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal); setOperationAction(ISD::SETCC, MVT::v8i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); @@ -856,6 +869,8 @@ return "AArch64ISD::NEON_QSHLs"; case AArch64ISD::NEON_QSHLu: return "AArch64ISD::NEON_QSHLu"; + case AArch64ISD::NEON_VDUPLANE: + return "AArch64ISD::NEON_VDUPLANE"; default: return NULL; } @@ -2687,6 +2702,7 @@ case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, getSubtarget()); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); } return SDValue(); @@ -3476,6 +3492,35 @@ return SDValue(); } +SDValue +AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDValue V1 = Op.getOperand(0); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast(Op.getNode()); + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + ArrayRef ShuffleMask = SVN->getMask(); + + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (EltSize <= 64) { + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. 
+ if (Lane == -1) Lane = 0; + + return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1, + DAG.getConstant(Lane, MVT::i64)); + } + } + + return SDValue(); +} + AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { if (Constraint.size() == 1) { Index: lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- lib/Target/AArch64/AArch64InstrFormats.td +++ lib/Target/AArch64/AArch64InstrFormats.td @@ -975,15 +975,14 @@ class NeonI_3VSame size, bits<5> opcode, dag outs, dag ins, string asmstr, list patterns, InstrItinClass itin> - : A64InstRdnm -{ + : A64InstRdnm { let Inst{31} = 0b0; let Inst{30} = q; let Inst{29} = u; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0b1; - // Inherit Rm in 20-16 + // Inherit Rm in 20-16 let Inst{15-11} = opcode; let Inst{10} = 0b1; // Inherit Rn in 9-5 @@ -994,15 +993,14 @@ class NeonI_3VDiff size, bits<4> opcode, dag outs, dag ins, string asmstr, list patterns, InstrItinClass itin> - : A64InstRdnm -{ + : A64InstRdnm { let Inst{31} = 0b0; let Inst{30} = q; let Inst{29} = u; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0b1; - // Inherit Rm in 20-16 + // Inherit Rm in 20-16 let Inst{15-12} = opcode; let Inst{11} = 0b0; let Inst{10} = 0b0; @@ -1010,12 +1008,31 @@ // Inherit Rd in 4-0 } +// Format AdvSIMD two registers and an element +class NeonI_2VElem size, bits<4> opcode, + dag outs, dag ins, string asmstr, + list patterns, InstrItinClass itin> + : A64InstRdnm { + let Inst{31} = 0b0; + let Inst{30} = q; + let Inst{29} = u; + let Inst{28-24} = 0b01111; + let Inst{23-22} = size; + // l in Inst{21} + // m in Inst{20} + // Inherit Rm in 19-16 + let Inst{15-12} = opcode; + // h in Inst{11} + let Inst{10} = 0b0; + // Inherit Rn in 9-5 + // Inherit Rd in 4-0 +} + // Format AdvSIMD 1 vector register with modified immediate class NeonI_1VModImm patterns, InstrItinClass itin> - : A64InstRd -{ + : A64InstRd { bits<8> Imm; bits<4> cmode; let Inst{31} = 0b0; @@ -1035,15 +1052,14 @@ class NeonI_Scalar3Same size, bits<5> opcode, dag outs, dag ins, string asmstr, list patterns, InstrItinClass itin> - : A64InstRdnm -{ + : A64InstRdnm { let Inst{31} = 0b0; let Inst{30} = 0b1; let Inst{29} = u; let Inst{28-24} = 0b11110; let Inst{23-22} = size; let Inst{21} = 0b1; - // Inherit Rm in 20-16 + // Inherit Rm in 20-16 let Inst{15-11} = opcode; let Inst{10} = 0b1; // Inherit Rn in 9-5 @@ -1055,8 +1071,7 @@ class NeonI_2VMisc size, bits<5> opcode, dag outs, dag ins, string asmstr, list patterns, InstrItinClass itin> - : A64InstRdn -{ + : A64InstRdn { let Inst{31} = 0b0; let Inst{30} = q; let Inst{29} = u; @@ -1092,8 +1107,7 @@ class NeonI_copy imm4, dag outs, dag ins, string asmstr, list patterns, InstrItinClass itin> - : A64InstRdn -{ + : A64InstRdn { bits<5> Imm5; let Inst{31} = 0b0; let Inst{30} = q; @@ -1111,8 +1125,7 @@ class NeonI_insert patterns, InstrItinClass itin> - : A64InstRdn -{ + : A64InstRdn { bits<5> Imm5; bits<4> Imm4; let Inst{31} = 0b0; Index: lib/Target/AArch64/AArch64InstrNEON.td =================================================================== --- lib/Target/AArch64/AArch64InstrNEON.td +++ lib/Target/AArch64/AArch64InstrNEON.td @@ -49,6 +49,8 @@ def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>; def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>; +def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2, + [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>; 
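Reviewer note (not part of the patch): the intended selection flow for the new by-element patterns is roughly as follows — a sketch based on the LowerVECTOR_SHUFFLE change above and the neon-2velem.ll tests below; the concrete register numbers in the expected assembly are illustrative only.

  ; IR: a splat of lane 1 of %v feeding a multiply (cf. test_vmul_lane_s32)
  %s = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %m = mul <2 x i32> %s, %a
  ; During legalization the splat shuffle becomes NEON_VDUPLANE(%v, 1) instead of
  ; staying a generic VECTOR_SHUFFLE, and the NI_2VE_mul_lane pattern folds the
  ; dup into the lane-indexed multiply, e.g.:
  ;   mul v0.2s, v1.2s, v2.s[1]
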
//===----------------------------------------------------------------------===// // Multiclasses @@ -1557,12 +1559,22 @@ defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>; defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>; -def Neon_top16B : PatFrag<(ops node:$in), - (extract_subvector (v16i8 node:$in), (iPTR 8))>; -def Neon_top8H : PatFrag<(ops node:$in), - (extract_subvector (v8i16 node:$in), (iPTR 4))>; -def Neon_top4S : PatFrag<(ops node:$in), - (extract_subvector (v4i32 node:$in), (iPTR 2))>; +def Neon_High16B : PatFrag<(ops node:$in), + (extract_subvector (v16i8 node:$in), (iPTR 8))>; +def Neon_High8H : PatFrag<(ops node:$in), + (extract_subvector (v8i16 node:$in), (iPTR 4))>; +def Neon_High4S : PatFrag<(ops node:$in), + (extract_subvector (v4i32 node:$in), (iPTR 2))>; + +def Neon_low8H : PatFrag<(ops node:$in), + (v4i16 (extract_subvector (v8i16 node:$in), + (iPTR 0)))>; +def Neon_low4S : PatFrag<(ops node:$in), + (v2i32 (extract_subvector (v4i32 node:$in), + (iPTR 0)))>; +def Neon_low4f : PatFrag<(ops node:$in), + (v2f32 (extract_subvector (v4f32 node:$in), + (iPTR 0)))>; class N2VShiftLong opcode, string asmop, string DestT, string SrcT, ValueType DestTy, ValueType SrcTy, @@ -1610,17 +1622,17 @@ // 128-bit vector types def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", - v8i16, v8i8, 8, uimm3, ExtOp, Neon_top16B> { + v8i16, v8i8, 8, uimm3, ExtOp, Neon_High16B> { let Inst{22-19} = 0b0001; // immh:immb = 0001xxx } def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", - v4i32, v4i16, 4, uimm4, ExtOp, Neon_top8H> { + v4i32, v4i16, 4, uimm4, ExtOp, Neon_High8H> { let Inst{22-20} = 0b001; // immh:immb = 001xxxx } def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", - v2i64, v2i32, 2, uimm5, ExtOp, Neon_top4S> { + v2i64, v2i32, 2, uimm5, ExtOp, Neon_High4S> { let Inst{22-21} = 0b01; // immh:immb = 01xxxxx } @@ -1634,13 +1646,13 @@ def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))), (!cast(prefix # "_2S") VPR64:$Rn, 0)>; - def : Pat<(v8i16 (ExtOp (v8i8 (Neon_top16B VPR128:$Rn)))), + def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))), (!cast(prefix # "_16B") VPR128:$Rn, 0)>; - def : Pat<(v4i32 (ExtOp (v4i16 (Neon_top8H VPR128:$Rn)))), + def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))), (!cast(prefix # "_8H") VPR128:$Rn, 0)>; - def : Pat<(v2i64 (ExtOp (v2i32 (Neon_top4S VPR128:$Rn)))), + def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))), (!cast(prefix # "_4S") VPR128:$Rn, 0)>; } @@ -2018,9 +2030,21 @@ defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">; defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">; -def Neon_combine : PatFrag<(ops node:$Rm, node:$Rn), - (v2i64 (concat_vectors (v1i64 node:$Rm), - (v1i64 node:$Rn)))>; +def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn), + (v2i64 (concat_vectors (v1i64 node:$Rm), + (v1i64 node:$Rn)))>; +def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn), + (v8i16 (concat_vectors (v4i16 node:$Rm), + (v4i16 node:$Rn)))>; +def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn), + (v4i32 (concat_vectors (v2i32 node:$Rm), + (v2i32 node:$Rn)))>; +def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn), + (v4f32 (concat_vectors (v2f32 node:$Rm), + (v2f32 node:$Rn)))>; +def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn), + (v2f64 (concat_vectors (v1f64 node:$Rm), + (v1f64 node:$Rn)))>; def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs), (v8i16 (srl (v8i16 node:$lhs), @@ -2053,17 +2077,17 @@ imm:$Imm))), (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine 
(v1i64 VPR64:$src), (v1i64 (bitconvert + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") VPR128:$Rn, imm:$Imm)))))), (SHRNvvi_16B (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine (v1i64 VPR64:$src), (v1i64 (bitconvert + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") VPR128:$Rn, imm:$Imm)))))), (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine (v1i64 VPR64:$src), (v1i64 (bitconvert + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") VPR128:$Rn, imm:$Imm)))))), (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), @@ -2078,17 +2102,17 @@ def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), imm:$Imm)), (!cast(prefix # "_2S") VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine (v1i64 VPR64:$src), + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v8i8 (op (v8i16 VPR128:$Rn), imm:$Imm))))), (!cast(prefix # "_16B") (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine (v1i64 VPR64:$src), + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v4i16 (op (v4i32 VPR128:$Rn), imm:$Imm))))), (!cast(prefix # "_8H") (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; - def : Pat<(Neon_combine (v1i64 VPR64:$src), + def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v2i32 (op (v2i64 VPR128:$Rn), imm:$Imm))))), (!cast(prefix # "_4S") (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), @@ -2168,11 +2192,11 @@ multiclass Neon_sshll2_0 { def _v8i8 : PatFrag<(ops node:$Rn), - (v8i16 (ext (v8i8 (Neon_top16B node:$Rn))))>; + (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>; def _v4i16 : PatFrag<(ops node:$Rn), - (v4i32 (ext (v4i16 (Neon_top8H node:$Rn))))>; + (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>; def _v2i32 : PatFrag<(ops node:$Rn), - (v2i64 (ext (v2i32 (Neon_top4S node:$Rn))))>; + (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>; } defm NI_sext_high : Neon_sshll2_0; @@ -2438,7 +2462,7 @@ // part. 
class NarrowHighHalfPat - : Pat<(Neon_combine (v1i64 VPR64:$src), + : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn), (SrcTy VPR128:$Rm)))))), (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), @@ -2504,11 +2528,11 @@ multiclass NeonI_Op_High { def _16B : PatFrag<(ops node:$Rn, node:$Rm), - (op (v8i8 (Neon_top16B node:$Rn)), (v8i8 (Neon_top16B node:$Rm)))>; + (op (v8i8 (Neon_High16B node:$Rn)), (v8i8 (Neon_High16B node:$Rm)))>; def _8H : PatFrag<(ops node:$Rn, node:$Rm), - (op (v4i16 (Neon_top8H node:$Rn)), (v4i16 (Neon_top8H node:$Rm)))>; + (op (v4i16 (Neon_High8H node:$Rn)), (v4i16 (Neon_High8H node:$Rm)))>; def _4S : PatFrag<(ops node:$Rn, node:$Rm), - (op (v2i32 (Neon_top4S node:$Rn)), (v2i32 (Neon_top4S node:$Rm)))>; + (op (v2i32 (Neon_High4S node:$Rn)), (v2i32 (Neon_High4S node:$Rm)))>; } @@ -2674,19 +2698,19 @@ opnode, v2i64, v2i32>; } -def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rm, node:$Rn), +def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), (add node:$Rd, (int_arm_neon_vmulls node:$Rn, node:$Rm))>; -def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rm, node:$Rn), +def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), (add node:$Rd, (int_arm_neon_vmullu node:$Rn, node:$Rm))>; -def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rm, node:$Rn), +def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), (sub node:$Rd, (int_arm_neon_vmulls node:$Rn, node:$Rm))>; -def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rm, node:$Rn), +def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm), (sub node:$Rd, (int_arm_neon_vmullu node:$Rn, node:$Rm))>; @@ -3235,6 +3259,780 @@ let Constraints = "$src = $Rd"; } +// The followings are for instruction class (3V Elem) + +// Variant 1 + +class NI_2VE size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; + + let Constraints = "$src = $Rd"; +} + +multiclass NI_2VE_v1 opcode, string asmop> +{ + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
+ def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; +defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; + +// Pattern for lane in 128-bit vector +class NI_2VE_laneq + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_lane + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_v1_pat +{ + def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_lane(subop # "_4s4s"), neon_uimm1_bare, + op, VPR128, VPR128, VPR64, v4i32, v4i32, v2i32, + BinOpFrag<(Neon_vduplane + (Neon_combine_4S node:$LHS, undef), + node:$RHS)>>; + + def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_lane(subop # "_8h8h"), neon_uimm2_bare, + op, VPR128, VPR128, VPR64Lo, v8i16, v8i16, v4i16, + BinOpFrag<(Neon_vduplane + (Neon_combine_8H node:$LHS, undef), + node:$RHS)>>; +} + +defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; +defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; + +class NI_2VE_2op size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; +} + +multiclass NI_2VE_v1_2op opcode, string asmop> +{ + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
+ def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; +defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; +defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; + +// Pattern for lane in 128-bit vector +class NI_2VE_mul_laneq + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_mul_lane + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_mul_v1_pat +{ + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i32, v2i32, v4i32, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4i32, v4i32, v4i32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i16, v4i16, v8i16, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128Lo, v8i16, v8i16, v8i16, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i32, v2i32, v2i32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_mul_lane(subop # "_4s4s"), neon_uimm1_bare, + op, VPR128, VPR64, v4i32, v4i32, v2i32, + BinOpFrag<(Neon_vduplane + (Neon_combine_4S node:$LHS, undef), + node:$RHS)>>; + + def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i16, v4i16, v4i16, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_mul_lane(subop # "_8h8h"), neon_uimm2_bare, + op, VPR128, VPR64Lo, v8i16, v8i16, v4i16, + BinOpFrag<(Neon_vduplane + (Neon_combine_8H node:$LHS, undef), + node:$RHS)>>; +} + +defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; +defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; +defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; + +// Variant 2 + +multiclass NI_2VE_v2_2op opcode, string asmop> +{ + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! 
+ + def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; +defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; + +class NI_2VE_mul_lane_2d + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; + +multiclass NI_2VE_mul_v2_pat +{ + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2f32, v2f32, v4f32, + BinOpFrag<(Neon_vduplane + (Neon_low4f node:$LHS), node:$RHS)>>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4f32, v4f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR128, v2f64, v2f64, v2f64, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2f32, v2f32, v2f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_mul_lane(subop # "_4s4s"), neon_uimm1_bare, + op, VPR128, VPR64, v4f32, v4f32, v2f32, + BinOpFrag<(Neon_vduplane + (Neon_combine_4f node:$LHS, undef), + node:$RHS)>>; + + def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR64, v2f64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; +defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; + +// The followings are patterns using fma +// -ffp-contract=fast generates fma + +multiclass NI_2VE_v2 opcode, string asmop> +{ + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! 
+ + def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; +defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; + +// Pattern for lane in 128-bit vector +class NI_2VEswap_laneq + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), + (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane_2d2d + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; + + +multiclass NI_2VE_fma_v2_pat +{ + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane + (Neon_low4f node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEswap_lane(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(Neon_vduplane + (Neon_combine_4f node:$LHS, undef), + node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; + +multiclass NI_2VE_fms_v2_pat +{ + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane + (Neon_low4f node:$LHS), node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane + (Neon_low4f (fneg node:$LHS)), + node:$RHS)>>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + 
BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEswap_lane(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane + (Neon_combine_4f node:$LHS, undef), + node:$RHS))>>; + + def : NI_2VEswap_lane(subop # "_4s4s"), + neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, + BinOpFrag<(Neon_vduplane + (Neon_combine_4f (fneg node:$LHS), undef), + node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(fneg (Neon_combine_2d + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d + (fneg node:$LHS), (fneg node:$RHS))>>; +} + +defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; + +// Variant 3: Long type +// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S +// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S + +multiclass NI_2VE_v3 opcode, string asmop> +{ + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. + def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">; +defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">; +defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">; +defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; +defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; +defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; + +multiclass NI_2VE_v3_2op opcode, string asmop> +{ + // vector register class for element is always 128-bit to cover the max index + def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", + neon_uimm2_bare, VPR128, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
+ def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", + neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; +defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; +defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_laneq + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_lane + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VEL_v3_pat +{ + def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, + op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, + op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, + op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, + op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +} + +defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; +defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>; +defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>; +defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; + +// Pattern for lane in 128-bit vector +class NI_2VEL2_mul_laneq + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEL2_mul_lane + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VEL_mul_v3_pat +{ + def : NI_2VE_mul_laneq(subop # "_4s4h"), 
neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i32, v4i16, v8i16, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VE_mul_laneq(subop # "_2d2s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i64, v2i32, v4i32, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + def : NI_2VEL2_mul_laneq(subop # "_4s8h"), neon_uimm3_bare, + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, + Neon_High8H, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VEL2_mul_laneq(subop # "_2d4s"), neon_uimm2_bare, + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_4s4h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i32, v4i16, v4i16, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_mul_lane(subop # "_2d2s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i64, v2i32, v2i32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEL2_mul_lane(subop # "_4s8h"), neon_uimm2_bare, + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEL2_mul_lane(subop # "_2d4s"), neon_uimm1_bare, + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +} + +defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; +defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; +defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; + +multiclass NI_qdma +{ + def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; + + def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), + (op node:$Ra, + (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; +} + +defm Neon_qdmlal : NI_qdma; +defm Neon_qdmlsl : NI_qdma; + +multiclass NI_2VEL_v3_qdma_pat +{ + def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, + !cast(op # "_4s"), VPR128, VPR64, VPR128Lo, + v4i32, v4i16, v8i16, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, + !cast(op # "_2d"), VPR128, VPR64, VPR128, + v2i64, v2i32, v4i32, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, + !cast(op # "_4s"), VPR128Lo, + v4i32, v8i16, v8i16, v4i16, Neon_High8H, + BinOpFrag<(Neon_vduplane + (Neon_low8H node:$LHS), node:$RHS)>>; + + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, + !cast(op # "_2d"), VPR128, + v2i64, v4i32, v4i32, v2i32, Neon_High4S, + BinOpFrag<(Neon_vduplane + (Neon_low4S node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, + !cast(op # "_4s"), VPR128, VPR64, VPR64Lo, + v4i32, v4i16, v4i16, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, + !cast(op # "_2d"), VPR128, VPR64, VPR64, + v2i64, v2i32, v2i32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, + !cast(op # "_4s"), VPR64Lo, + v4i32, v8i16, v4i16, v4i16, Neon_High8H, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, + !cast(op # "_2d"), VPR64, + v2i64, v4i32, v2i32, v2i32, Neon_High4S, + 
BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +} + +defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; +defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; + +// End of implementation for instruction class (3V Elem) //Insert element (vector, from main) def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, Index: lib/Target/AArch64/AArch64RegisterInfo.td =================================================================== --- lib/Target/AArch64/AArch64RegisterInfo.td +++ lib/Target/AArch64/AArch64RegisterInfo.td @@ -145,14 +145,21 @@ (sequence "S%u", 0, 31)> { } -def FPR64 : RegisterClass<"AArch64", [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], +def FPR64 : RegisterClass<"AArch64", + [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], 64, (sequence "D%u", 0, 31)>; def FPR128 : RegisterClass<"AArch64", - [f128,v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128, - (sequence "Q%u", 0, 31)>; + [f128,v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], + 128, (sequence "Q%u", 0, 31)>; +def FPR64Lo : RegisterClass<"AArch64", + [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64], + 64, (sequence "D%u", 0, 15)>; +def FPR128Lo : RegisterClass<"AArch64", + [f128,v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], + 128, (sequence "Q%u", 0, 15)>; //===----------------------------------------------------------------------===// // Vector registers: @@ -168,6 +175,10 @@ def VPR128 : RegisterOperand; +def VPR64Lo : RegisterOperand; + +def VPR128Lo : RegisterOperand; + // Flags register def NZCV : Register<"nzcv"> { let Namespace = "AArch64"; Index: lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp =================================================================== --- lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -85,6 +85,9 @@ static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, unsigned OptionHiS, @@ -349,6 +352,15 @@ return MCDisassembler::Success; } +static DecodeStatus +DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 15) + return MCDisassembler::Fail; + + return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder); +} + static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst, unsigned OptionHiS, uint64_t Address, Index: test/CodeGen/AArch64/neon-2velem.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/neon-2velem.ll @@ -0,0 +1,1334 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>) + +declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>) + +declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>) + +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) + +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) + +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x 
i32>, <2 x i32>) + +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) + +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmla_lane_s16: +; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %add = add <4 x i16> %mul, %a + ret <4 x i16> %add +} + +define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlaq_lane_s16: +; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %b + %add = add <8 x i16> %mul, %a + ret <8 x i16> %add +} + +define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmla_lane_s32: +; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %b + %add = add <2 x i32> %mul, %a + ret <2 x i32> %add +} + +define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlaq_lane_s32: +; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %b + %add = add <4 x i32> %mul, %a + ret <4 x i32> %add +} + +define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmla_laneq_s16: +; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %add = add <4 x i16> %mul, %a + ret <4 x i16> %add +} + +define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlaq_laneq_s16: +; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %b + %add = add <8 x i16> %mul, %a + ret <8 x i16> %add +} + +define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmla_laneq_s32: +; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %b + %add = add <2 x i32> %mul, %a + ret <2 x i32> %add +} + +define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlaq_laneq_s32: +; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] 
+entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %b + %add = add <4 x i32> %mul, %a + ret <4 x i32> %add +} + +define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmls_lane_s16: +; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} + +define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsq_lane_s16: +; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %b + %sub = sub <8 x i16> %a, %mul + ret <8 x i16> %sub +} + +define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmls_lane_s32: +; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %b + %sub = sub <2 x i32> %a, %mul + ret <2 x i32> %sub +} + +define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsq_lane_s32: +; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %b + %sub = sub <4 x i32> %a, %mul + ret <4 x i32> %sub +} + +define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmls_laneq_s16: +; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} + +define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsq_laneq_s16: +; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %b + %sub = sub <8 x i16> %a, %mul + ret <8 x i16> %sub +} + +define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmls_laneq_s32: +; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %b + %sub = sub <2 x i32> %a, %mul + ret <2 x i32> %sub +} + +define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsq_laneq_s32: +; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %b + %sub = sub <4 x i32> %a, %mul + ret <4 x i32> %sub +} + +define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmul_lane_s16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmulq_lane_s16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x 
i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmul_lane_s32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmulq_lane_s32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmul_lane_u16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmulq_lane_u16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmul_lane_u32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmulq_lane_u32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmul_laneq_s16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmulq_laneq_s16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmul_laneq_s32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmulq_laneq_s32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmul_laneq_u16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %a + ret <4 x i16> %mul +} + +define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x 
i16> %v) { +; CHECK: test_vmulq_laneq_u16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %mul = mul <8 x i16> %shuffle, %a + ret <8 x i16> %mul +} + +define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmul_laneq_u32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %mul = mul <2 x i32> %shuffle, %a + ret <2 x i32> %mul +} + +define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmulq_laneq_u32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + %mul = mul <4 x i32> %shuffle, %a + ret <4 x i32> %mul +} + +define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { +; CHECK: test_vfma_lane_f32: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> + %mul = fmul <2 x float> %shuffle, %b + %add = fadd <2 x float> %mul, %a + ret <2 x float> %add +} + +define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { +; CHECK: test_vfmaq_lane_f32: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> + %mul = fmul <4 x float> %shuffle, %b + %add = fadd <4 x float> %mul, %a + ret <4 x float> %add +} + +define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { +; CHECK: test_vfma_laneq_f32: +; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> + %mul = fmul <2 x float> %shuffle, %b + %add = fadd <2 x float> %mul, %a + ret <2 x float> %add +} + +define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { +; CHECK: test_vfmaq_laneq_f32: +; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> + %mul = fmul <4 x float> %shuffle, %b + %add = fadd <4 x float> %mul, %a + ret <4 x float> %add +} + +define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { +; CHECK: test_vfms_lane_f32: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> + %mul = fmul <2 x float> %shuffle, %b + %sub = fsub <2 x float> %a, %mul + ret <2 x float> %sub +} + +define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { +; CHECK: test_vfmsq_lane_f32: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> + %mul = fmul <4 x float> %shuffle, %b + %sub = fsub <4 x float> %a, %mul + ret <4 x float> %sub +} + +define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { +; CHECK: test_vfms_laneq_f32: +; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> + %mul = fmul <2 x float> %shuffle, %b + %sub = fsub <2 x float> %a, %mul + ret <2 x float> %sub +} + +define <4 x float> 
@test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { +; CHECK: test_vfmsq_laneq_f32: +; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> + %mul = fmul <4 x float> %shuffle, %b + %sub = fsub <4 x float> %a, %mul + ret <4 x float> %sub +} + +define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { +; CHECK: test_vfmaq_lane_f64: +; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %b + %add = fadd <2 x double> %mul, %a + ret <2 x double> %add +} + +define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmaq_laneq_f64_0: +; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %b + %add = fadd <2 x double> %mul, %a + ret <2 x double> %add +} + +define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmaq_laneq_f64: +; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> + %mul = fmul <2 x double> %shuffle, %b + %add = fadd <2 x double> %mul, %a + ret <2 x double> %add +} + +define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { +; CHECK: test_vfmsq_lane_f64: +; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %b + %sub = fsub <2 x double> %a, %mul + ret <2 x double> %sub +} + +define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmsq_laneq_f64_0: +; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %b + %sub = fsub <2 x double> %a, %mul + ret <2 x double> %sub +} + +define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { +; CHECK: test_vfmsq_laneq_f64: +; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> + %mul = fmul <2 x double> %shuffle, %b + %sub = fsub <2 x double> %a, %mul + ret <2 x double> %sub +} + +define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_lane_s16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_lane_s32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %add = add 
<2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_laneq_s16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_laneq_s32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_high_lane_s16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_high_lane_s32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_high_laneq_s16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_high_laneq_s32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_lane_s16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_lane_s32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, 
{{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_laneq_s16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_laneq_s32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_high_lane_s16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_high_lane_s32: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_high_laneq_s16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_high_laneq_s32: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_lane_u16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %add = add 
<4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_lane_u32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_laneq_u16: +; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_laneq_u32: +; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlal_high_lane_u16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlal_high_lane_u32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlal_high_laneq_u16: +; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %add = add <4 x i32> %vmull2.i, %a + ret <4 x i32> %add +} + +define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlal_high_laneq_u32: +; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %add = add <2 x i64> %vmull2.i, %a + ret <2 x i64> %add +} + +define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_lane_u16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, 
{{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_lane_u32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_laneq_u16: +; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_laneq_u32: +; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vmlsl_high_lane_u16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vmlsl_high_lane_u32: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %sub = sub <2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { +; CHECK: test_vmlsl_high_laneq_u16: +; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %sub = sub <4 x i32> %a, %vmull2.i + ret <4 x i32> %sub +} + +define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { +; CHECK: test_vmlsl_high_laneq_u32: +; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %sub = sub 
<2 x i64> %a, %vmull2.i + ret <2 x i64> %sub +} + +define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_lane_s16: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_lane_s32: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_lane_u16: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_lane_u32: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_high_lane_s16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_high_lane_s32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vmull_high_lane_u16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vmull_high_lane_u32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_laneq_s16: +; CHECK: mull 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_laneq_s32: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_laneq_u16: +; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_laneq_u32: +; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_high_laneq_s16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_high_laneq_s32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vmull_high_laneq_u16: +; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + ret <4 x i32> %vmull2.i +} + +define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vmull_high_laneq_u32: +; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + ret <2 x i64> %vmull2.i +} + +define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlal_lane_s16: +; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x 
i32> + %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) #2 + ret <4 x i32> %vqdmlal4.i +} + +define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlal_lane_s32: +; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) #2 + ret <2 x i64> %vqdmlal4.i +} + +define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlal_high_lane_s16: +; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i) #2 + ret <4 x i32> %vqdmlal4.i +} + +define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlal_high_lane_s32: +; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i) #2 + ret <2 x i64> %vqdmlal4.i +} + +define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlsl_lane_s16: +; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) #2 + %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) #2 + ret <4 x i32> %vqdmlsl4.i +} + +define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlsl_lane_s32: +; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) #2 + %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) #2 + ret <2 x i64> %vqdmlsl4.i +} + +define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { +; CHECK: test_vqdmlsl_high_lane_s16: +; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i) #2 + ret <4 x i32> %vqdmlsl4.i 
+} + +define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { +; CHECK: test_vqdmlsl_high_lane_s32: +; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i) #2 + ret <2 x i64> %vqdmlsl4.i +} + +define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmull_lane_s16: +; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmull_lane_s32: +; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { +; CHECK: test_vqdmull_laneq_s16: +; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { +; CHECK: test_vqdmull_laneq_s32: +; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmull_high_lane_s16: +; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle) #2 + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmull_high_lane_s32: +; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + ret <2 x i64> %vqdmull2.i +} + +define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { +; CHECK: test_vqdmull_high_laneq_s16: +; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x 
i16> %shuffle) #2 + ret <4 x i32> %vqdmull2.i +} + +define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { +; CHECK: test_vqdmull_high_laneq_s32: +; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> + %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle) #2 + ret <2 x i64> %vqdmull2.i +} + +define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmulh_lane_s16: +; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i16> %vqdmulh2.i +} + +define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqdmulhq_lane_s16: +; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> + %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) #2 + ret <8 x i16> %vqdmulh2.i +} + +define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmulh_lane_s32: +; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i32> %vqdmulh2.i +} + +define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqdmulhq_lane_s32: +; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> + %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) #2 + ret <4 x i32> %vqdmulh2.i +} + +define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqrdmulh_lane_s16: +; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) #2 + ret <4 x i16> %vqrdmulh2.i +} + +define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { +; CHECK: test_vqrdmulhq_lane_s16: +; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> + %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) #2 + ret <8 x i16> %vqrdmulh2.i +} + +define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqrdmulh_lane_s32: +; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) #2 + ret <2 x i32> %vqrdmulh2.i +} + +define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { +; CHECK: test_vqrdmulhq_lane_s32: +; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> + 
%vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) #2 + ret <4 x i32> %vqrdmulh2.i +} + +define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { +; CHECK: test_vmul_lane_f32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> + %mul = fmul <2 x float> %shuffle, %a + ret <2 x float> %mul +} + +define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { +; CHECK: test_vmulq_lane_f32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> + %mul = fmul <4 x float> %shuffle, %a + ret <4 x float> %mul +} + +define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { +; CHECK: test_vmulq_lane_f64: +; CHECK: mul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %a + ret <2 x double> %mul +} + +define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { +; CHECK: test_vmul_laneq_f32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> + %mul = fmul <2 x float> %shuffle, %a + ret <2 x float> %mul +} + +define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { +; CHECK: test_vmulq_laneq_f32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> + %mul = fmul <4 x float> %shuffle, %a + ret <4 x float> %mul +} + +define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulq_laneq_f64_0: +; CHECK: mul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer + %mul = fmul <2 x double> %shuffle, %a + ret <2 x double> %mul +} + +define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulq_laneq_f64: +; CHECK: mul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> + %mul = fmul <2 x double> %shuffle, %a + ret <2 x double> %mul +} + +define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { +; CHECK: test_vmulx_lane_f32: +; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) #2 + ret <2 x float> %vmulx2.i +} + +define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { +; CHECK: test_vmulxq_lane_f32: +; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) #2 + ret <4 x float> %vmulx2.i +} + +define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { +; CHECK: test_vmulxq_lane_f64: +; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> 
zeroinitializer + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) #2 + ret <2 x double> %vmulx2.i +} + +define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { +; CHECK: test_vmulx_laneq_f32: +; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> + %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) #2 + ret <2 x float> %vmulx2.i +} + +define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { +; CHECK: test_vmulxq_laneq_f32: +; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}] +entry: + %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> + %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) #2 + ret <4 x float> %vmulx2.i +} + +define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulxq_laneq_f64_0: +; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) #2 + ret <2 x double> %vmulx2.i +} + +define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { +; CHECK: test_vmulxq_laneq_f64: +; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[{{[0-9]+}}] +entry: + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> + %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) #2 + ret <2 x double> %vmulx2.i +} + Index: test/CodeGen/AArch64/neon-diagnostics.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/neon-diagnostics.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { +; CHECK: test_vfma_lane_f32: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}] +; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +entry: + %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> + %mul = fmul <2 x float> %shuffle, %b + %add = fadd <2 x float> %mul, %a + ret <2 x float> %add +} + Index: test/MC/AArch64/neon-2velem.s =================================================================== --- /dev/null +++ test/MC/AArch64/neon-2velem.s @@ -0,0 +1,271 @@ +// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s + +// Check that the assembler can handle the documented syntax for AArch64 + +//------------------------------------------------------------------------------ +// Instructions with 2 vectors and an element +//------------------------------------------------------------------------------ + + mla v0.2s, v1.2s, v2.s[2] + mla v0.2s, v1.2s, v22.s[2] + mla v3.4s, v8.4s, v2.s[1] + mla v3.4s, v8.4s, v22.s[3] + +// CHECK: mla v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x08,0x82,0x2f] +// CHECK: mla v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x08,0x96,0x2f] +// CHECK: mla v3.4s, v8.4s, v2.s[1] // encoding: [0x03,0x01,0xa2,0x6f] +// CHECK: mla v3.4s, v8.4s, v22.s[3] // encoding: [0x03,0x09,0xb6,0x6f] + + mla v0.4h, v1.4h, v2.h[2] + mla v0.4h, v1.4h, v15.h[2] + 
mla v0.8h, v1.8h, v2.h[7] + mla v0.8h, v1.8h, v14.h[6] + +// CHECK: mla v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0x00,0x62,0x2f] +// CHECK: mla v0.4h, v1.4h, v15.h[2] // encoding: [0x20,0x00,0x6f,0x2f] +// CHECK: mla v0.8h, v1.8h, v2.h[7] // encoding: [0x20,0x08,0x72,0x6f] +// CHECK: mla v0.8h, v1.8h, v14.h[6] // encoding: [0x20,0x08,0x6e,0x6f] + + mls v0.2s, v1.2s, v2.s[2] + mls v0.2s, v1.2s, v22.s[2] + mls v3.4s, v8.4s, v2.s[1] + mls v3.4s, v8.4s, v22.s[3] + +// CHECK: mls v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x48,0x82,0x2f] +// CHECK: mls v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x48,0x96,0x2f] +// CHECK: mls v3.4s, v8.4s, v2.s[1] // encoding: [0x03,0x41,0xa2,0x6f] +// CHECK: mls v3.4s, v8.4s, v22.s[3] // encoding: [0x03,0x49,0xb6,0x6f] + + mls v0.4h, v1.4h, v2.h[2] + mls v0.4h, v1.4h, v15.h[2] + mls v0.8h, v1.8h, v2.h[7] + mls v0.8h, v1.8h, v14.h[6] + +// CHECK: mls v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0x40,0x62,0x2f] +// CHECK: mls v0.4h, v1.4h, v15.h[2] // encoding: [0x20,0x40,0x6f,0x2f] +// CHECK: mls v0.8h, v1.8h, v2.h[7] // encoding: [0x20,0x48,0x72,0x6f] +// CHECK: mls v0.8h, v1.8h, v14.h[6] // encoding: [0x20,0x48,0x6e,0x6f] + + fmla v0.2s, v1.2s, v2.s[2] + fmla v0.2s, v1.2s, v22.s[2] + fmla v3.4s, v8.4s, v2.s[1] + fmla v3.4s, v8.4s, v22.s[3] + fmla v0.2d, v1.2d, v2.d[1] + fmla v0.2d, v1.2d, v22.d[1] + +// CHECK: fmla v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x18,0x82,0x0f] +// CHECK: fmla v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x18,0x96,0x0f] +// CHECK: fmla v3.4s, v8.4s, v2.s[1] // encoding: [0x03,0x11,0xa2,0x4f] +// CHECK: fmla v3.4s, v8.4s, v22.s[3] // encoding: [0x03,0x19,0xb6,0x4f] +// CHECK: fmla v0.2d, v1.2d, v2.d[1] // encoding: [0x20,0x18,0xc2,0x4f] +// CHECK: fmla v0.2d, v1.2d, v22.d[1] // encoding: [0x20,0x18,0xd6,0x4f] + + fmls v0.2s, v1.2s, v2.s[2] + fmls v0.2s, v1.2s, v22.s[2] + fmls v3.4s, v8.4s, v2.s[1] + fmls v3.4s, v8.4s, v22.s[3] + fmls v0.2d, v1.2d, v2.d[1] + fmls v0.2d, v1.2d, v22.d[1] + +// CHECK: fmls v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x58,0x82,0x0f] +// CHECK: fmls v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x58,0x96,0x0f] +// CHECK: fmls v3.4s, v8.4s, v2.s[1] // encoding: [0x03,0x51,0xa2,0x4f] +// CHECK: fmls v3.4s, v8.4s, v22.s[3] // encoding: [0x03,0x59,0xb6,0x4f] +// CHECK: fmls v0.2d, v1.2d, v2.d[1] // encoding: [0x20,0x58,0xc2,0x4f] +// CHECK: fmls v0.2d, v1.2d, v22.d[1] // encoding: [0x20,0x58,0xd6,0x4f] + + smlal v0.4s, v1.4h, v2.h[2] + smlal v0.2d, v1.2s, v2.s[2] + smlal v0.2d, v1.2s, v22.s[2] + smlal2 v0.4s, v1.8h, v1.h[2] + smlal2 v0.2d, v1.4s, v1.s[2] + smlal2 v0.2d, v1.4s, v22.s[2] + +// CHECK: smlal v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0x20,0x62,0x0f] +// CHECK: smlal v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0x28,0x82,0x0f] +// CHECK: smlal v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0x28,0x96,0x0f] +// CHECK: smlal2 v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x20,0x61,0x4f] +// CHECK: smlal2 v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x28,0x81,0x4f] +// CHECK: smlal2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x28,0x96,0x4f] + + smlsl v0.4s, v1.4h, v2.h[2] + smlsl v0.2d, v1.2s, v2.s[2] + smlsl v0.2d, v1.2s, v22.s[2] + smlsl2 v0.4s, v1.8h, v1.h[2] + smlsl2 v0.2d, v1.4s, v1.s[2] + smlsl2 v0.2d, v1.4s, v22.s[2] + +// CHECK: smlsl v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0x60,0x62,0x0f] +// CHECK: smlsl v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0x68,0x82,0x0f] +// CHECK: smlsl v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0x68,0x96,0x0f] +// CHECK: smlsl2 v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x60,0x61,0x4f] +// CHECK: smlsl2 
v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x68,0x81,0x4f] +// CHECK: smlsl2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x68,0x96,0x4f] + + sqdmlal v0.4s, v1.4h, v2.h[2] + sqdmlal v0.2d, v1.2s, v2.s[2] + sqdmlal v0.2d, v1.2s, v22.s[2] + sqdmlal2 v0.4s, v1.8h, v1.h[2] + sqdmlal2 v0.2d, v1.4s, v1.s[2] + sqdmlal2 v0.2d, v1.4s, v22.s[2] + +// CHECK: sqdmlal v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0x30,0x62,0x0f] +// CHECK: sqdmlal v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0x38,0x82,0x0f] +// CHECK: sqdmlal v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0x38,0x96,0x0f] +// CHECK: sqdmlal2 v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x30,0x61,0x4f] +// CHECK: sqdmlal2 v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x38,0x81,0x4f] +// CHECK: sqdmlal2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x38,0x96,0x4f] + + umlal v0.4s, v1.4h, v2.h[2] + umlal v0.2d, v1.2s, v2.s[2] + umlal v0.2d, v1.2s, v22.s[2] + umlal2 v0.4s, v1.8h, v1.h[2] + umlal2 v0.2d, v1.4s, v1.s[2] + umlal2 v0.2d, v1.4s, v22.s[2] + +// CHECK: umlal v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0x20,0x62,0x2f] +// CHECK: umlal v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0x28,0x82,0x2f] +// CHECK: umlal v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0x28,0x96,0x2f] +// CHECK: umlal2 v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x20,0x61,0x6f] +// CHECK: umlal2 v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x28,0x81,0x6f] +// CHECK: umlal2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x28,0x96,0x6f] + + umlsl v0.4s, v1.4h, v2.h[2] + umlsl v0.2d, v1.2s, v2.s[2] + umlsl v0.2d, v1.2s, v22.s[2] + umlsl2 v0.4s, v1.8h, v1.h[2] + umlsl2 v0.2d, v1.4s, v1.s[2] + umlsl2 v0.2d, v1.4s, v22.s[2] + +// CHECK: umlsl v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0x60,0x62,0x2f] +// CHECK: umlsl v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0x68,0x82,0x2f] +// CHECK: umlsl v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0x68,0x96,0x2f] +// CHECK: umlsl2 v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x60,0x61,0x6f] +// CHECK: umlsl2 v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x68,0x81,0x6f] +// CHECK: umlsl2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x68,0x96,0x6f] + + sqdmlsl v0.4s, v1.4h, v2.h[2] + sqdmlsl v0.2d, v1.2s, v2.s[2] + sqdmlsl v0.2d, v1.2s, v22.s[2] + sqdmlsl2 v0.4s, v1.8h, v1.h[2] + sqdmlsl2 v0.2d, v1.4s, v1.s[2] + sqdmlsl2 v0.2d, v1.4s, v22.s[2] + +// CHECK: sqdmlsl v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0x70,0x62,0x0f] +// CHECK: sqdmlsl v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0x78,0x82,0x0f] +// CHECK: sqdmlsl v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0x78,0x96,0x0f] +// CHECK: sqdmlsl2 v0.4s, v1.8h, v1.h[2] // encoding: [0x20,0x70,0x61,0x4f] +// CHECK: sqdmlsl2 v0.2d, v1.4s, v1.s[2] // encoding: [0x20,0x78,0x81,0x4f] +// CHECK: sqdmlsl2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0x78,0x96,0x4f] + + mul v0.4h, v1.4h, v2.h[2] + mul v0.8h, v1.8h, v2.h[2] + mul v0.2s, v1.2s, v2.s[2] + mul v0.2s, v1.2s, v22.s[2] + mul v0.4s, v1.4s, v2.s[2] + mul v0.4s, v1.4s, v22.s[2] + +// CHECK: mul v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0x80,0x62,0x0f] +// CHECK: mul v0.8h, v1.8h, v2.h[2] // encoding: [0x20,0x80,0x62,0x4f] +// CHECK: mul v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x88,0x82,0x0f] +// CHECK: mul v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x88,0x96,0x0f] +// CHECK: mul v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0x88,0x82,0x4f] +// CHECK: mul v0.4s, v1.4s, v22.s[2] // encoding: [0x20,0x88,0x96,0x4f] + + fmul v0.2s, v1.2s, v2.s[2] + fmul v0.2s, v1.2s, v22.s[2] + fmul v0.4s, v1.4s, v2.s[2] + fmul v0.4s, v1.4s, v22.s[2] + fmul v0.2d, v1.2d, v2.d[1] + fmul v0.2d, v1.2d, v22.d[1] + +// CHECK: fmul v0.2s, v1.2s, 
v2.s[2] // encoding: [0x20,0x98,0x82,0x0f] +// CHECK: fmul v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x98,0x96,0x0f] +// CHECK: fmul v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0x98,0x82,0x4f] +// CHECK: fmul v0.4s, v1.4s, v22.s[2] // encoding: [0x20,0x98,0x96,0x4f] +// CHECK: fmul v0.2d, v1.2d, v2.d[1] // encoding: [0x20,0x98,0xc2,0x4f] +// CHECK: fmul v0.2d, v1.2d, v22.d[1] // encoding: [0x20,0x98,0xd6,0x4f] + + fmulx v0.2s, v1.2s, v2.s[2] + fmulx v0.2s, v1.2s, v22.s[2] + fmulx v0.4s, v1.4s, v2.s[2] + fmulx v0.4s, v1.4s, v22.s[2] + fmulx v0.2d, v1.2d, v2.d[1] + fmulx v0.2d, v1.2d, v22.d[1] + +// CHECK: fmulx v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0x98,0x82,0x2f] +// CHECK: fmulx v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0x98,0x96,0x2f] +// CHECK: fmulx v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0x98,0x82,0x6f] +// CHECK: fmulx v0.4s, v1.4s, v22.s[2] // encoding: [0x20,0x98,0x96,0x6f] +// CHECK: fmulx v0.2d, v1.2d, v2.d[1] // encoding: [0x20,0x98,0xc2,0x6f] +// CHECK: fmulx v0.2d, v1.2d, v22.d[1] // encoding: [0x20,0x98,0xd6,0x6f] + + smull v0.4s, v1.4h, v2.h[2] + smull v0.2d, v1.2s, v2.s[2] + smull v0.2d, v1.2s, v22.s[2] + smull2 v0.4s, v1.8h, v2.h[2] + smull2 v0.2d, v1.4s, v2.s[2] + smull2 v0.2d, v1.4s, v22.s[2] + +// CHECK: smull v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0xa0,0x62,0x0f] +// CHECK: smull v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0xa8,0x82,0x0f] +// CHECK: smull v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0xa8,0x96,0x0f] +// CHECK: smull2 v0.4s, v1.8h, v2.h[2] // encoding: [0x20,0xa0,0x62,0x4f] +// CHECK: smull2 v0.2d, v1.4s, v2.s[2] // encoding: [0x20,0xa8,0x82,0x4f] +// CHECK: smull2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0xa8,0x96,0x4f] + + umull v0.4s, v1.4h, v2.h[2] + umull v0.2d, v1.2s, v2.s[2] + umull v0.2d, v1.2s, v22.s[2] + umull2 v0.4s, v1.8h, v2.h[2] + umull2 v0.2d, v1.4s, v2.s[2] + umull2 v0.2d, v1.4s, v22.s[2] + +// CHECK: umull v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0xa0,0x62,0x2f] +// CHECK: umull v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0xa8,0x82,0x2f] +// CHECK: umull v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0xa8,0x96,0x2f] +// CHECK: umull2 v0.4s, v1.8h, v2.h[2] // encoding: [0x20,0xa0,0x62,0x6f] +// CHECK: umull2 v0.2d, v1.4s, v2.s[2] // encoding: [0x20,0xa8,0x82,0x6f] +// CHECK: umull2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0xa8,0x96,0x6f] + + sqdmull v0.4s, v1.4h, v2.h[2] + sqdmull v0.2d, v1.2s, v2.s[2] + sqdmull v0.2d, v1.2s, v22.s[2] + sqdmull2 v0.4s, v1.8h, v2.h[2] + sqdmull2 v0.2d, v1.4s, v2.s[2] + sqdmull2 v0.2d, v1.4s, v22.s[2] + +// CHECK: sqdmull v0.4s, v1.4h, v2.h[2] // encoding: [0x20,0xb0,0x62,0x0f] +// CHECK: sqdmull v0.2d, v1.2s, v2.s[2] // encoding: [0x20,0xb8,0x82,0x0f] +// CHECK: sqdmull v0.2d, v1.2s, v22.s[2] // encoding: [0x20,0xb8,0x96,0x0f] +// CHECK: sqdmull2 v0.4s, v1.8h, v2.h[2] // encoding: [0x20,0xb0,0x62,0x4f] +// CHECK: sqdmull2 v0.2d, v1.4s, v2.s[2] // encoding: [0x20,0xb8,0x82,0x4f] +// CHECK: sqdmull2 v0.2d, v1.4s, v22.s[2] // encoding: [0x20,0xb8,0x96,0x4f] + + sqdmulh v0.4h, v1.4h, v2.h[2] + sqdmulh v0.8h, v1.8h, v2.h[2] + sqdmulh v0.2s, v1.2s, v2.s[2] + sqdmulh v0.2s, v1.2s, v22.s[2] + sqdmulh v0.4s, v1.4s, v2.s[2] + sqdmulh v0.4s, v1.4s, v22.s[2] + +// CHECK: sqdmulh v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0xc0,0x62,0x0f] +// CHECK: sqdmulh v0.8h, v1.8h, v2.h[2] // encoding: [0x20,0xc0,0x62,0x4f] +// CHECK: sqdmulh v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0xc8,0x82,0x0f] +// CHECK: sqdmulh v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0xc8,0x96,0x0f] +// CHECK: sqdmulh v0.4s, v1.4s, v2.s[2] // encoding: 
[0x20,0xc8,0x82,0x4f] +// CHECK: sqdmulh v0.4s, v1.4s, v22.s[2] // encoding: [0x20,0xc8,0x96,0x4f] + + sqrdmulh v0.4h, v1.4h, v2.h[2] + sqrdmulh v0.8h, v1.8h, v2.h[2] + sqrdmulh v0.2s, v1.2s, v2.s[2] + sqrdmulh v0.2s, v1.2s, v22.s[2] + sqrdmulh v0.4s, v1.4s, v2.s[2] + sqrdmulh v0.4s, v1.4s, v22.s[2] + +// CHECK: sqrdmulh v0.4h, v1.4h, v2.h[2] // encoding: [0x20,0xd0,0x62,0x0f] +// CHECK: sqrdmulh v0.8h, v1.8h, v2.h[2] // encoding: [0x20,0xd0,0x62,0x4f] +// CHECK: sqrdmulh v0.2s, v1.2s, v2.s[2] // encoding: [0x20,0xd8,0x82,0x0f] +// CHECK: sqrdmulh v0.2s, v1.2s, v22.s[2] // encoding: [0x20,0xd8,0x96,0x0f] +// CHECK: sqrdmulh v0.4s, v1.4s, v2.s[2] // encoding: [0x20,0xd8,0x82,0x4f] +// CHECK: sqrdmulh v0.4s, v1.4s, v22.s[2] // encoding: [0x20,0xd8,0x96,0x4f] Index: test/MC/AArch64/neon-diagnostics.s =================================================================== --- test/MC/AArch64/neon-diagnostics.s +++ test/MC/AArch64/neon-diagnostics.s @@ -2849,3 +2849,762 @@ // CHECK-ERROR: fminnmp v1.4s, v2.2d // CHECK-ERROR: ^ + mla v0.2d, v1.2d, v16.d[1] + mla v0.2s, v1.2s, v2.s[4] + mla v0.4s, v1.4s, v2.s[4] + mla v0.2h, v1.2h, v2.h[1] + mla v0.4h, v1.4h, v2.h[8] + mla v0.8h, v1.8h, v2.h[8] + mla v0.4h, v1.4h, v16.h[2] + mla v0.8h, v1.8h, v16.h[2] + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mla v0.2d, v1.2d, v16.d[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mla v0.2s, v1.2s, v2.s[4] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mla v0.4s, v1.4s, v2.s[4] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mla v0.2h, v1.2h, v2.h[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mla v0.4h, v1.4h, v2.h[8] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mla v0.8h, v1.8h, v2.h[8] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mla v0.4h, v1.4h, v16.h[2] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mla v0.8h, v1.8h, v16.h[2] +// CHECK-ERROR: ^ + + mls v0.2d, v1.2d, v16.d[1] + mls v0.2s, v1.2s, v2.s[4] + mls v0.4s, v1.4s, v2.s[4] + mls v0.2h, v1.2h, v2.h[1] + mls v0.4h, v1.4h, v2.h[8] + mls v0.8h, v1.8h, v2.h[8] + mls v0.4h, v1.4h, v16.h[2] + mls v0.8h, v1.8h, v16.h[2] + +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mls v0.2d, v1.2d, v16.d[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mls v0.2s, v1.2s, v2.s[4] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mls v0.4s, v1.4s, v2.s[4] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mls v0.2h, v1.2h, v2.h[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mls v0.4h, v1.4h, v2.h[8] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: lane number incompatible with layout +// CHECK-ERROR: mls v0.8h, v1.8h, v2.h[8] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mls v0.4h, v1.4h, v16.h[2] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: mls v0.8h, v1.8h, v16.h[2] +// CHECK-ERROR: ^ + + fmla v0.4h, v1.4h, v2.h[2] + fmla v0.8h, v1.8h, v2.h[2] + fmla v0.2s, v1.2s, v2.s[4] + fmla v0.2s, v1.2s, v22.s[4] + fmla 
v3.4s, v8.4s, v2.s[4]
+        fmla v3.4s, v8.4s, v22.s[4]
+        fmla v0.2d, v1.2d, v2.d[2]
+        fmla v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmla v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmla v0.8h, v1.8h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmla v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmla v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmla v3.4s, v8.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmla v3.4s, v8.4s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmla v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmla v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR: ^
+
+        fmls v0.4h, v1.4h, v2.h[2]
+        fmls v0.8h, v1.8h, v2.h[2]
+        fmls v0.2s, v1.2s, v2.s[4]
+        fmls v0.2s, v1.2s, v22.s[4]
+        fmls v3.4s, v8.4s, v2.s[4]
+        fmls v3.4s, v8.4s, v22.s[4]
+        fmls v0.2d, v1.2d, v2.d[2]
+        fmls v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmls v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmls v0.8h, v1.8h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmls v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmls v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmls v3.4s, v8.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmls v3.4s, v8.4s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmls v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmls v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR: ^
+
+        smlal v0.4h, v1.4h, v2.h[2]
+        smlal v0.4s, v1.4h, v2.h[8]
+        smlal v0.4s, v1.4h, v16.h[2]
+        smlal v0.2s, v1.2s, v2.s[4]
+        smlal v0.2d, v1.2s, v2.s[4]
+        smlal v0.2d, v1.2s, v22.s[4]
+        smlal2 v0.4h, v1.8h, v1.h[2]
+        smlal2 v0.4s, v1.8h, v1.h[8]
+        smlal2 v0.4s, v1.8h, v16.h[2]
+        smlal2 v0.2s, v1.4s, v1.s[2]
+        smlal2 v0.2d, v1.4s, v1.s[4]
+        smlal2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlal v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlal v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlal v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlal v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlal v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlal v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlal2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlal2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlal2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlal2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlal2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlal2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        smlsl v0.4h, v1.4h, v2.h[2]
+        smlsl v0.4s, v1.4h, v2.h[8]
+        smlsl v0.4s, v1.4h, v16.h[2]
+        smlsl v0.2s, v1.2s, v2.s[4]
+        smlsl v0.2d, v1.2s, v2.s[4]
+        smlsl v0.2d, v1.2s, v22.s[4]
+        smlsl2 v0.4h, v1.8h, v1.h[2]
+        smlsl2 v0.4s, v1.8h, v1.h[8]
+        smlsl2 v0.4s, v1.8h, v16.h[2]
+        smlsl2 v0.2s, v1.4s, v1.s[2]
+        smlsl2 v0.2d, v1.4s, v1.s[4]
+        smlsl2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlsl v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlsl v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlsl v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlsl v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlsl v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlsl v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlsl2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlsl2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlsl2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smlsl2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlsl2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smlsl2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        umlal v0.4h, v1.4h, v2.h[2]
+        umlal v0.4s, v1.4h, v2.h[8]
+        umlal v0.4s, v1.4h, v16.h[2]
+        umlal v0.2s, v1.2s, v2.s[4]
+        umlal v0.2d, v1.2s, v2.s[4]
+        umlal v0.2d, v1.2s, v22.s[4]
+        umlal2 v0.4h, v1.8h, v1.h[2]
+        umlal2 v0.4s, v1.8h, v1.h[8]
+        umlal2 v0.4s, v1.8h, v16.h[2]
+        umlal2 v0.2s, v1.4s, v1.s[2]
+        umlal2 v0.2d, v1.4s, v1.s[4]
+        umlal2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlal v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlal v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlal v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlal v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlal v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlal v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlal2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlal2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlal2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlal2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlal2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlal2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        umlsl v0.4h, v1.4h, v2.h[2]
+        umlsl v0.4s, v1.4h, v2.h[8]
+        umlsl v0.4s, v1.4h, v16.h[2]
+        umlsl v0.2s, v1.2s, v2.s[4]
+        umlsl v0.2d, v1.2s, v2.s[4]
+        umlsl v0.2d, v1.2s, v22.s[4]
+        umlsl2 v0.4h, v1.8h, v1.h[2]
+        umlsl2 v0.4s, v1.8h, v1.h[8]
+        umlsl2 v0.4s, v1.8h, v16.h[2]
+        umlsl2 v0.2s, v1.4s, v1.s[2]
+        umlsl2 v0.2d, v1.4s, v1.s[4]
+        umlsl2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlsl v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlsl v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlsl v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlsl v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlsl v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlsl v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlsl2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlsl2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlsl2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umlsl2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlsl2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umlsl2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        sqdmlal v0.4h, v1.4h, v2.h[2]
+        sqdmlal v0.4s, v1.4h, v2.h[8]
+        sqdmlal v0.4s, v1.4h, v16.h[2]
+        sqdmlal v0.2s, v1.2s, v2.s[4]
+        sqdmlal v0.2d, v1.2s, v2.s[4]
+        sqdmlal v0.2d, v1.2s, v22.s[4]
+        sqdmlal2 v0.4h, v1.8h, v1.h[2]
+        sqdmlal2 v0.4s, v1.8h, v1.h[8]
+        sqdmlal2 v0.4s, v1.8h, v16.h[2]
+        sqdmlal2 v0.2s, v1.4s, v1.s[2]
+        sqdmlal2 v0.2d, v1.4s, v1.s[4]
+        sqdmlal2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlal v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlal v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlal v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlal v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlal v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlal v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlal2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlal2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlal2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlal2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlal2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlal2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        sqdmlsl v0.4h, v1.4h, v2.h[2]
+        sqdmlsl v0.4s, v1.4h, v2.h[8]
+        sqdmlsl v0.4s, v1.4h, v16.h[2]
+        sqdmlsl v0.2s, v1.2s, v2.s[4]
+        sqdmlsl v0.2d, v1.2s, v2.s[4]
+        sqdmlsl v0.2d, v1.2s, v22.s[4]
+        sqdmlsl2 v0.4h, v1.8h, v1.h[2]
+        sqdmlsl2 v0.4s, v1.8h, v1.h[8]
+        sqdmlsl2 v0.4s, v1.8h, v16.h[2]
+        sqdmlsl2 v0.2s, v1.4s, v1.s[2]
+        sqdmlsl2 v0.2d, v1.4s, v1.s[4]
+        sqdmlsl2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlsl v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlsl v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlsl v0.4s, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlsl v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlsl v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlsl v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlsl2 v0.4h, v1.8h, v1.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlsl2 v0.4s, v1.8h, v1.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlsl2 v0.4s, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmlsl2 v0.2s, v1.4s, v1.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlsl2 v0.2d, v1.4s, v1.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmlsl2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        mul v0.4h, v1.4h, v2.h[8]
+        mul v0.4h, v1.4h, v16.h[8]
+        mul v0.8h, v1.8h, v2.h[8]
+        mul v0.8h, v1.8h, v16.h[8]
+        mul v0.2s, v1.2s, v2.s[4]
+        mul v0.2s, v1.2s, v22.s[4]
+        mul v0.4s, v1.4s, v2.s[4]
+        mul v0.4s, v1.4s, v22.s[4]
+        mul v0.2d, v1.2d, v2.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.4h, v1.4h, v16.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.8h, v1.8h, v16.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: mul v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        fmul v0.4h, v1.4h, v2.h[4]
+        fmul v0.2s, v1.2s, v2.s[4]
+        fmul v0.2s, v1.2s, v22.s[4]
+        fmul v0.4s, v1.4s, v2.s[4]
+        fmul v0.4s, v1.4s, v22.s[4]
+        fmul v0.2d, v1.2d, v2.d[2]
+        fmul v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mul v0.2d, v1.2d, v2.d[1]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmul v0.4h, v1.4h, v2.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmul v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmul v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmul v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmul v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmul v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmul v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR: ^
+
+        fmulx v0.4h, v1.4h, v2.h[4]
+        fmulx v0.2s, v1.2s, v2.s[4]
+        fmulx v0.2s, v1.2s, v22.s[4]
+        fmulx v0.4s, v1.4s, v2.s[4]
+        fmulx v0.4s, v1.4s, v22.s[4]
+        fmulx v0.2d, v1.2d, v2.d[2]
+        fmulx v0.2d, v1.2d, v22.d[2]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmulx v0.4h, v1.4h, v2.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmulx v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmulx v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmulx v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmulx v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmulx v0.2d, v1.2d, v2.d[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: fmulx v0.2d, v1.2d, v22.d[2]
+// CHECK-ERROR: ^
+
+        smull v0.4h, v1.4h, v2.h[2]
+        smull v0.4s, v1.4h, v2.h[8]
+        smull v0.4s, v1.4h, v16.h[4]
+        smull v0.2s, v1.2s, v2.s[2]
+        smull v0.2d, v1.2s, v2.s[4]
+        smull v0.2d, v1.2s, v22.s[4]
+        smull2 v0.4h, v1.8h, v2.h[2]
+        smull2 v0.4s, v1.8h, v2.h[8]
+        smull2 v0.4s, v1.8h, v16.h[4]
+        smull2 v0.2s, v1.4s, v2.s[2]
+        smull2 v0.2d, v1.4s, v2.s[4]
+        smull2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smull v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smull v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smull v0.4s, v1.4h, v16.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smull v0.2s, v1.2s, v2.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smull v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smull v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smull2 v0.4h, v1.8h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smull2 v0.4s, v1.8h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smull2 v0.4s, v1.8h, v16.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smull2 v0.2s, v1.4s, v2.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smull2 v0.2d, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: smull2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        umull v0.4h, v1.4h, v2.h[2]
+        umull v0.4s, v1.4h, v2.h[8]
+        umull v0.4s, v1.4h, v16.h[4]
+        umull v0.2s, v1.2s, v2.s[2]
+        umull v0.2d, v1.2s, v2.s[4]
+        umull v0.2d, v1.2s, v22.s[4]
+        umull2 v0.4h, v1.8h, v2.h[2]
+        umull2 v0.4s, v1.8h, v2.h[8]
+        umull2 v0.4s, v1.8h, v16.h[4]
+        umull2 v0.2s, v1.4s, v2.s[2]
+        umull2 v0.2d, v1.4s, v2.s[4]
+        umull2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umull v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umull v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umull v0.4s, v1.4h, v16.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umull v0.2s, v1.2s, v2.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umull v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umull v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umull2 v0.4h, v1.8h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umull2 v0.4s, v1.8h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umull2 v0.4s, v1.8h, v16.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umull2 v0.2s, v1.4s, v2.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umull2 v0.2d, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: umull2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        sqdmull v0.4h, v1.4h, v2.h[2]
+        sqdmull v0.4s, v1.4h, v2.h[8]
+        sqdmull v0.4s, v1.4h, v16.h[4]
+        sqdmull v0.2s, v1.2s, v2.s[2]
+        sqdmull v0.2d, v1.2s, v2.s[4]
+        sqdmull v0.2d, v1.2s, v22.s[4]
+        sqdmull2 v0.4h, v1.8h, v2.h[2]
+        sqdmull2 v0.4s, v1.8h, v2.h[8]
+        sqdmull2 v0.4s, v1.8h, v16.h[4]
+        sqdmull2 v0.2s, v1.4s, v2.s[2]
+        sqdmull2 v0.2d, v1.4s, v2.s[4]
+        sqdmull2 v0.2d, v1.4s, v22.s[4]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmull v0.4h, v1.4h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmull v0.4s, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmull v0.4s, v1.4h, v16.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmull v0.2s, v1.2s, v2.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmull v0.2d, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmull v0.2d, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmull2 v0.4h, v1.8h, v2.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmull2 v0.4s, v1.8h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmull2 v0.4s, v1.8h, v16.h[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmull2 v0.2s, v1.4s, v2.s[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmull2 v0.2d, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmull2 v0.2d, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+
+        sqdmulh v0.4h, v1.4h, v2.h[8]
+        sqdmulh v0.4h, v1.4h, v16.h[2]
+        sqdmulh v0.8h, v1.8h, v2.h[8]
+        sqdmulh v0.8h, v1.8h, v16.h[2]
+        sqdmulh v0.2s, v1.2s, v2.s[4]
+        sqdmulh v0.2s, v1.2s, v22.s[4]
+        sqdmulh v0.4s, v1.4s, v2.s[4]
+        sqdmulh v0.4s, v1.4s, v22.s[4]
+        sqdmulh v0.2d, v1.2d, v22.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmulh v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v0.4h, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmulh v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v0.8h, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmulh v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmulh v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmulh v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqdmulh v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v0.2d, v1.2d, v22.d[1]
+// CHECK-ERROR: ^
+
+        sqrdmulh v0.4h, v1.4h, v2.h[8]
+        sqrdmulh v0.4h, v1.4h, v16.h[2]
+        sqrdmulh v0.8h, v1.8h, v2.h[8]
+        sqrdmulh v0.8h, v1.8h, v16.h[2]
+        sqrdmulh v0.2s, v1.2s, v2.s[4]
+        sqrdmulh v0.2s, v1.2s, v22.s[4]
+        sqrdmulh v0.4s, v1.4s, v2.s[4]
+        sqrdmulh v0.4s, v1.4s, v22.s[4]
+        sqrdmulh v0.2d, v1.2d, v22.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqrdmulh v0.4h, v1.4h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v0.4h, v1.4h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqrdmulh v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v0.8h, v1.8h, v16.h[2]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqrdmulh v0.2s, v1.2s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqrdmulh v0.2s, v1.2s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqrdmulh v0.4s, v1.4s, v2.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: sqrdmulh v0.4s, v1.4s, v22.s[4]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v0.2d, v1.2d, v22.d[1]
+// CHECK-ERROR: ^