diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -51,39 +51,39 @@ def OP_FMLS_N : Op<(call "vfma", $p0, (op "-", $p1), (dup $p2))>; def OP_MLAL_N : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>; def OP_MLSL_N : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>; -def OP_MUL_LN : Op<(op "*", $p0, (splat $p1, $p2))>; -def OP_MULX_LN : Op<(call "vmulx", $p0, (splat $p1, $p2))>; +def OP_MUL_LN : Op<(op "*", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_MULX_LN : Op<(call "vmulx", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_MULL_N : Op<(call "vmull", $p0, (dup $p1))>; -def OP_MULL_LN : Op<(call "vmull", $p0, (splat $p1, $p2))>; -def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (splat $p1, $p2))>; -def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (splat $p2, $p3)))>; -def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (splat $p2, $p3)))>; -def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (splat $p2, $p3)))>; +def OP_MULL_LN : Op<(call "vmull", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (call_mangled "splat_lane", $p1, $p2))>; +def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_MLALHi_LN: Op<(op "+", $p0, (call "vmull", (call "vget_high", $p1), - (splat $p2, $p3)))>; -def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (splat $p2, $p3)))>; + (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_MLSLHi_LN : Op<(op "-", $p0, (call "vmull", (call "vget_high", $p1), - (splat $p2, $p3)))>; + (call_mangled "splat_lane", $p2, $p3)))>; 
def OP_QDMULL_N : Op<(call "vqdmull", $p0, (dup $p1))>; -def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (splat $p1, $p2))>; +def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QDMULLHi_LN : Op<(call "vqdmull", (call "vget_high", $p0), - (splat $p1, $p2))>; + (call_mangled "splat_lane", $p1, $p2))>; def OP_QDMLAL_N : Op<(call "vqdmlal", $p0, $p1, (dup $p2))>; -def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (splat $p2, $p3))>; +def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLALHi_LN : Op<(call "vqdmlal", $p0, (call "vget_high", $p1), - (splat $p2, $p3))>; + (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLSL_N : Op<(call "vqdmlsl", $p0, $p1, (dup $p2))>; -def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (splat $p2, $p3))>; +def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLSLHi_LN : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1), - (splat $p2, $p3))>; + (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMULH_N : Op<(call "vqdmulh", $p0, (dup $p1))>; -def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (splat $p1, $p2))>; -def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (splat $p1, $p2))>; +def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>; def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>; def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>; -def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>; -def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>; +def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", 
$p2, $p3)))>; def OP_FMS_LN : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>; def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>; def OP_TRN1 : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2), @@ -115,7 +115,7 @@ def OP_LO : Op<(shuffle $p0, $p0, (lowhalf mask0))>; def OP_CONC : Op<(shuffle $p0, $p1, (add mask0, mask1))>; def OP_DUP : Op<(dup $p0)>; -def OP_DUP_LN : Op<(splat $p0, $p1)>; +def OP_DUP_LN : Op<(call_mangled "splat_lane", $p0, $p1)>; def OP_SEL : Op<(cast "R", (op "|", (op "&", $p0, (cast $p0, $p1)), (op "&", (op "~", $p0), (cast $p0, $p2))))>; @@ -207,10 +207,10 @@ def OP_DOT_LN : Op<(call "vdot", $p0, $p1, - (bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>; + (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>; def OP_DOT_LNQ : Op<(call "vdot", $p0, $p1, - (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>; + (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>; def OP_FMLAL_LN : Op<(call "vfmlal_low", $p0, $p1, (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; @@ -222,7 +222,19 @@ (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; //===----------------------------------------------------------------------===// -// Instructions +// Auxiliary Instructions +//===----------------------------------------------------------------------===// + +// Splat operation - performs a range-checked splat over a vector +def SPLAT : WInst<"splat_lane", ".(!q)I", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">; +def SPLATQ : WInst<"splat_laneq", ".(!Q)I", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> { + let isLaneQ = 1; +} + +//===----------------------------------------------------------------------===// +// Intrinsics //===----------------------------------------------------------------------===// //////////////////////////////////////////////////////////////////////////////// @@ -881,16 +893,22 @@ def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI", 
"QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; def COPY_LANEQ : IOpInst<"vcopy_laneq", "..IQI", - "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>; + "csilPcPsPlUcUsUiUlfd", OP_COPY_LN> { + let isLaneQ = 1; +} def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I", - "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; + "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN> { + let isLaneQ = 1; +} //////////////////////////////////////////////////////////////////////////////// // Set all lanes to same value def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "hdQhQdPlQPl", OP_DUP_LN>; def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI", "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl", - OP_DUP_LN>; + OP_DUP_LN> { + let isLaneQ = 1; +} def DUP_N : WOpInst<"vdup_n", ".1", "dQdPlQPl", OP_DUP>; def MOV_N : WOpInst<"vmov_n", ".1", "dQdPlQPl", OP_DUP>; @@ -906,38 +924,60 @@ //////////////////////////////////////////////////////////////////////////////// def VMLA_LANEQ : IOpInst<"vmla_laneq", "...QI", - "siUsUifQsQiQUsQUiQf", OP_MLA_LN>; + "siUsUifQsQiQUsQUiQf", OP_MLA_LN> { + let isLaneQ = 1; +} def VMLS_LANEQ : IOpInst<"vmls_laneq", "...QI", - "siUsUifQsQiQUsQUiQf", OP_MLS_LN>; + "siUsUifQsQiQUsQUiQf", OP_MLS_LN> { + let isLaneQ = 1; +} def VFMA_LANE : IInst<"vfma_lane", "...qI", "fdQfQd">; def VFMA_LANEQ : IInst<"vfma_laneq", "...QI", "fdQfQd"> { let isLaneQ = 1; } def VFMS_LANE : IOpInst<"vfms_lane", "...qI", "fdQfQd", OP_FMS_LN>; -def VFMS_LANEQ : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ>; +def VFMS_LANEQ : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ> { + let isLaneQ = 1; +} -def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN>; +def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN> { + let isLaneQ = 1; +} def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLALHi_LN>; def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "(>Q)(>Q)QQI", "siUsUi", - OP_MLALHi_LN>; -def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", 
"(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN>; + OP_MLALHi_LN> { + let isLaneQ = 1; +} +def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN> { + let isLaneQ = 1; +} def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLSLHi_LN>; def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "(>Q)(>Q)QQI", "siUsUi", - OP_MLSLHi_LN>; + OP_MLSLHi_LN> { + let isLaneQ = 1; +} -def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN>; +def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN> { + let isLaneQ = 1; +} def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLALHi_LN>; def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "(>Q)(>Q)QQI", "si", - OP_QDMLALHi_LN>; -def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN>; + OP_QDMLALHi_LN> { + let isLaneQ = 1; +} +def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN> { + let isLaneQ = 1; +} def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLSLHi_LN>; def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "(>Q)(>Q)QQI", "si", - OP_QDMLSLHi_LN>; + OP_QDMLSLHi_LN> { + let isLaneQ = 1; +} // Newly add double parameter for vmul_lane in aarch64 // Note: d type is handled by SCALAR_VMUL_LANE @@ -945,32 +985,48 @@ // Note: d type is handled by SCALAR_VMUL_LANEQ def VMUL_LANEQ : IOpInst<"vmul_laneq", "..QI", - "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN>; -def VMULL_LANEQ : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN>; + "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN> { + let isLaneQ = 1; +} +def VMULL_LANEQ : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN> { + let isLaneQ = 1; +} def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "(>Q)Q.I", "siUsUi", OP_MULLHi_LN>; def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "(>Q)QQI", "siUsUi", - OP_MULLHi_LN>; + OP_MULLHi_LN> { + let isLaneQ = 1; +} -def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(>Q).QI", 
"si", OP_QDMULL_LN>; +def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN> { + let isLaneQ = 1; +} def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si", OP_QDMULLHi_LN>; def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si", - OP_QDMULLHi_LN>; + OP_QDMULLHi_LN> { + let isLaneQ = 1; +} let isLaneQ = 1 in { def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">; def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">; } let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { -def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>; -def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>; +def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> { + let isLaneQ = 1; +} +def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN> { + let isLaneQ = 1; +} } // Note: d type implemented by SCALAR_VMULX_LANE def VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "fQfQd", OP_MULX_LN>; // Note: d type is implemented by SCALAR_VMULX_LANEQ -def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN>; +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN> { + let isLaneQ = 1; +} //////////////////////////////////////////////////////////////////////////////// // Across vectors class @@ -1380,11 +1436,15 @@ // Scalar Floating Point multiply (scalar, by element) def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "11.I", "SfSd", OP_SCALAR_MUL_LN>; -def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN>; +def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN> { + let isLaneQ = 1; +} // Scalar Floating Point multiply extended (scalar, by element) def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "11.I", "SfSd", OP_SCALAR_MULX_LN>; -def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN>; +def SCALAR_FMULX_LANEQ : 
IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN> { + let isLaneQ = 1; +} def SCALAR_VMUL_N : IInst<"vmul_n", "..1", "d">; @@ -1400,48 +1460,70 @@ def SCALAR_VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "d", OP_SCALAR_VMULX_LN>; // VMULX_LANEQ d type implemented using scalar vmulx_laneq -def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ>; +def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ> { + let isLaneQ = 1; +} // Scalar Floating Point fused multiply-add (scalar, by element) def SCALAR_FMLA_LANE : IInst<"vfma_lane", "111.I", "SfSd">; -def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd">; +def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd"> { + let isLaneQ = 1; +} // Scalar Floating Point fused multiply-subtract (scalar, by element) def SCALAR_FMLS_LANE : IOpInst<"vfms_lane", "111.I", "SfSd", OP_FMS_LN>; -def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ>; +def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply Long (scalar by element) def SCALAR_SQDMULL_LANE : SOpInst<"vqdmull_lane", "(1>)1.I", "SsSi", OP_SCALAR_QDMULL_LN>; -def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN>; +def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply-Add Long (scalar by element) def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "(1>)(1>)1.I", "SsSi">; -def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi">; +def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi"> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply-Subtract Long (scalar by element) def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "(1>)(1>)1.I", "SsSi">; -def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi">; +def 
SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi"> { + let isLaneQ = 1; +} // Scalar Integer Saturating Doubling Multiply Half High (scalar by element) def SCALAR_SQDMULH_LANE : SOpInst<"vqdmulh_lane", "11.I", "SsSi", OP_SCALAR_QDMULH_LN>; -def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN>; +def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN> { + let isLaneQ = 1; +} // Scalar Integer Saturating Rounding Doubling Multiply Half High def SCALAR_SQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "11.I", "SsSi", OP_SCALAR_QRDMULH_LN>; -def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN>; +def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN> { + let isLaneQ = 1; +} let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "111.I", "SsSi", OP_SCALAR_QRDMLAH_LN>; -def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN>; +def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN> { + let isLaneQ = 1; +} // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "111.I", "SsSi", OP_SCALAR_QRDMLSH_LN>; -def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN>; +def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN> { + let isLaneQ = 1; +} } def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; -def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; +def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs"> { + let isLaneQ = 1; +} } // ARMv8.2-A FP16 vector intrinsics for 
A32/A64. @@ -1605,36 +1687,52 @@ // FMA lane def VFMA_LANEH : IInst<"vfma_lane", "...qI", "hQh">; - def VFMA_LANEQH : IInst<"vfma_laneq", "...QI", "hQh">; + def VFMA_LANEQH : IInst<"vfma_laneq", "...QI", "hQh"> { + let isLaneQ = 1; + } // FMA lane with scalar argument def FMLA_NH : SOpInst<"vfma_n", "...1", "hQh", OP_FMLA_N>; // Scalar floating point fused multiply-add (scalar, by element) def SCALAR_FMLA_LANEH : IInst<"vfma_lane", "111.I", "Sh">; - def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh">; + def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh"> { + let isLaneQ = 1; + } // FMS lane def VFMS_LANEH : IOpInst<"vfms_lane", "...qI", "hQh", OP_FMS_LN>; - def VFMS_LANEQH : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ>; + def VFMS_LANEQH : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ> { + let isLaneQ = 1; + } // FMS lane with scalar argument def FMLS_NH : SOpInst<"vfms_n", "...1", "hQh", OP_FMLS_N>; // Scalar floating foint fused multiply-subtract (scalar, by element) def SCALAR_FMLS_LANEH : IOpInst<"vfms_lane", "111.I", "Sh", OP_FMS_LN>; - def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ>; + def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ> { + let isLaneQ = 1; + } // Mul lane - def VMUL_LANEQH : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN>; + def VMUL_LANEQH : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN> { + let isLaneQ = 1; + } // Scalar floating point multiply (scalar, by element) def SCALAR_FMUL_LANEH : IOpInst<"vmul_lane", "11.I", "Sh", OP_SCALAR_MUL_LN>; - def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN>; + def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN> { + let isLaneQ = 1; + } // Mulx lane def VMULX_LANEH : IOpInst<"vmulx_lane", "..qI", "hQh", OP_MULX_LN>; - def VMULX_LANEQH : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN>; + def VMULX_LANEQH : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN> { + let isLaneQ = 1; + 
} def VMULX_NH : IOpInst<"vmulx_n", "..1", "hQh", OP_MULX_N>; // Scalar floating point mulx (scalar, by element) def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "11.I", "Sh">; - def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh">; + def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh"> { + let isLaneQ = 1; + } // ARMv8.2-A FP16 reduction vector intrinsics. def VMAXVH : SInst<"vmaxv", "1.", "hQh">; @@ -1651,7 +1749,9 @@ def VUZP2H : SOpInst<"vuzp2", "...", "hQh", OP_UZP2>; def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "1.I", "Sh">; - def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh">; + def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh"> { + let isLaneQ = 1; + } } // v8.2-A dot product instructions. @@ -1661,7 +1761,9 @@ } let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in { // Variants indexing into a 128-bit vector are A64 only. - def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ>; + def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ> { + let isLaneQ = 1; + } } // v8.2-A FP16 fused multiply-add long instructions. 
@@ -1676,10 +1778,18 @@ def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLAL_LN_Hi>; def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLSL_LN_Hi>; - def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN>; - def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN>; - def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi>; - def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>; + def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN> { + let isLaneQ = 1; + } + def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN> { + let isLaneQ = 1; + } + def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi> { + let isLaneQ = 1; + } + def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi> { + let isLaneQ = 1; + } } // v8.3-A Vector complex addition intrinsics diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td --- a/clang/include/clang/Basic/arm_neon_incl.td +++ b/clang/include/clang/Basic/arm_neon_incl.td @@ -60,6 +60,11 @@ // example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)" // (assuming $p0 has type int16x8_t). def call; +// call_mangled - Invoke another intrinsic matching the mangled name variation +// of the caller's base type. If there is no intrinsic defined +// that has the variation and takes the given types, an error +// is generated at tblgen time. +def call_mangled; // cast - Perform a cast to a different type. This gets emitted as a static // C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use // "bitcast". @@ -79,6 +84,7 @@ // - "D" - Double the number of lanes in the type. // - "8" - Convert type to an equivalent vector of 8-bit signed // integers. 
+// - "32" - Convert type to an equivalent vector of 32-bit integers. // example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return // value is of type "int32x4_t". // (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0 @@ -100,12 +106,6 @@ // example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}" // (assuming __p1 is float16x4_t, and __p2 is a compatible scalar). def dup_typed; -// splat - Take a vector and a lane index, and return a vector of the same type -// containing repeated instances of the source vector at the lane index. -// example: (splat $p0, $p1) -> -// "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)" -// (assuming __p0 has four elements). -def splat; // save_temp - Create a temporary (local) variable. The variable takes a name // based on the zero'th parameter and can be referenced using // using that name in subsequent DAGs in the same diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4493,10 +4493,14 @@ } } +Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C, unsigned NumElements) { + Value* SV = llvm::ConstantVector::getSplat(NumElements, C); + return Builder.CreateShuffleVector(V, V, SV, "lane"); +} + Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) { unsigned nElts = V->getType()->getVectorNumElements(); - Value* SV = llvm::ConstantVector::getSplat(nElts, C); - return Builder.CreateShuffleVector(V, V, SV, "lane"); + return EmitNeonSplat(V, C, nElts); } Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops, @@ -4603,6 +4607,10 @@ TypeModifier } static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { + NEONMAP0(splat_lane_v), + NEONMAP0(splat_laneq_v), + NEONMAP0(splatq_lane_v), + NEONMAP0(splatq_laneq_v), NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts), NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | 
UnsignedAlts), NEONMAP1(vabs_v, arm_neon_vabs, 0), @@ -4884,6 +4892,10 @@ }; static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { + NEONMAP0(splat_lane_v), + NEONMAP0(splat_laneq_v), + NEONMAP0(splatq_lane_v), + NEONMAP0(splatq_laneq_v), NEONMAP1(vabs_v, aarch64_neon_abs, 0), NEONMAP1(vabsq_v, aarch64_neon_abs, 0), NEONMAP0(vaddhn_v), @@ -5441,6 +5453,17 @@ switch (BuiltinID) { default: break; + case NEON::BI__builtin_neon_splat_lane_v: + case NEON::BI__builtin_neon_splat_laneq_v: + case NEON::BI__builtin_neon_splatq_lane_v: + case NEON::BI__builtin_neon_splatq_laneq_v: { + auto NumElements = VTy->getNumElements(); + if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v) NumElements *= 2; + if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v) NumElements /= 2; + + Ops[0] = Builder.CreateBitCast(Ops[0], VTy); + return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements); + } case NEON::BI__builtin_neon_vpadd_v: case NEON::BI__builtin_neon_vpaddq_v: // We don't allow fp/int overloading of intrinsics. 
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3779,6 +3779,7 @@ SmallVectorImpl<llvm::Value*> &O, const char *name, unsigned shift = 0, bool rightshift = false); + llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx, unsigned NumElements); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx); llvm::Value *EmitNeonShiftVector(llvm::Value *V, llvm::Type *Ty, bool negateForRightShift); diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c --- a/clang/test/CodeGen/aarch64-neon-2velem.c +++ b/clang/test/CodeGen/aarch64-neon-2velem.c @@ -7,8 +7,10 @@ // CHECK-LABEL: @test_vmla_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -18,8 +20,10 @@ // CHECK-LABEL: @test_vmlaq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // 
@@ -29,8 +33,10 @@ // CHECK-LABEL: @test_vmla_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -40,8 +46,10 @@ // CHECK-LABEL: @test_vmlaq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -51,8 +59,10 @@ // CHECK-LABEL: @test_vmla_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -62,8 +72,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_s16( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -73,8 +85,10 @@ // CHECK-LABEL: @test_vmla_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -84,8 +98,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -95,8 +111,10 @@ // CHECK-LABEL: @test_vmls_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -106,8 +124,10 @@ // CHECK-LABEL: @test_vmlsq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -117,8 +137,10 @@ // CHECK-LABEL: @test_vmls_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -128,8 +150,10 @@ // CHECK-LABEL: @test_vmlsq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> 
-// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -139,8 +163,10 @@ // CHECK-LABEL: @test_vmls_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -150,8 +176,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -161,8 +189,10 @@ // CHECK-LABEL: @test_vmls_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], 
[[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -172,8 +202,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -183,8 +215,10 @@ // CHECK-LABEL: @test_vmul_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { @@ -193,8 +227,10 @@ // CHECK-LABEL: @test_vmulq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> 
[[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { @@ -203,8 +239,10 @@ // CHECK-LABEL: @test_vmul_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { @@ -213,8 +251,10 @@ // CHECK-LABEL: @test_vmulq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { @@ -223,8 +263,10 @@ // CHECK-LABEL: @test_vmul_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -233,8 +275,10 @@ // CHECK-LABEL: @test_vmulq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { @@ -243,8 +287,10 @@ // CHECK-LABEL: @test_vmul_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -253,8 +299,10 @@ // CHECK-LABEL: @test_vmulq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector 
<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { @@ -263,8 +311,10 @@ // CHECK-LABEL: @test_vmul_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { @@ -273,8 +323,10 @@ // CHECK-LABEL: @test_vmulq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { @@ -283,8 +335,10 @@ // CHECK-LABEL: @test_vmul_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// 
CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { @@ -293,8 +347,10 @@ // CHECK-LABEL: @test_vmulq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { @@ -303,8 +359,10 @@ // CHECK-LABEL: @test_vmul_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -313,8 +371,10 @@ // CHECK-LABEL: @test_vmulq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] 
// CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { @@ -323,8 +383,10 @@ // CHECK-LABEL: @test_vmul_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -333,8 +395,10 @@ // CHECK-LABEL: @test_vmulq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) { @@ -584,10 +648,12 @@ // CHECK-LABEL: @test_vmlal_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -597,10 +663,12 @@ // CHECK-LABEL: @test_vmlal_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -610,10 +678,12 @@ // CHECK-LABEL: @test_vmlal_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 
x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -623,10 +693,12 @@ // CHECK-LABEL: @test_vmlal_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -637,10 +709,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -651,10 +725,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -665,10 
+741,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -679,10 +757,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -692,10 +772,12 @@ // CHECK-LABEL: @test_vmlsl_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -705,10 +787,12 @@ // CHECK-LABEL: @test_vmlsl_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -718,10 +802,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -731,10 +817,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
<4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -745,10 +833,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -759,10 +849,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], 
<2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -773,10 +865,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // 
CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -787,10 +881,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -800,10 +896,12 @@ // CHECK-LABEL: @test_vmlal_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -813,10 +911,12 @@ // CHECK-LABEL: @test_vmlal_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -826,10 +926,12 @@ // CHECK-LABEL: @test_vmlal_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: 
[[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -839,10 +941,12 @@ // CHECK-LABEL: @test_vmlal_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -853,10 +957,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -867,10 +973,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -881,10 +989,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = 
shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -895,10 +1005,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x 
i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -908,10 +1020,12 @@ // CHECK-LABEL: @test_vmlsl_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -921,10 +1035,12 @@ // CHECK-LABEL: @test_vmlsl_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -934,10 +1050,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -947,10 +1065,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> 
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -961,10 +1081,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -975,10 +1097,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -989,10 +1113,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] 
// @@ -1003,10 +1129,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1016,10 +1144,12 @@ // CHECK-LABEL: @test_vmull_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] 
to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { @@ -1028,10 +1158,12 @@ // CHECK-LABEL: @test_vmull_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { @@ -1040,10 +1172,12 @@ // CHECK-LABEL: @test_vmull_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -1052,10 +1186,12 @@ // CHECK-LABEL: @test_vmull_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -1065,10 +1201,12 @@ // CHECK-LABEL: @test_vmull_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> 
[[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { @@ -1078,10 +1216,12 @@ // CHECK-LABEL: @test_vmull_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { @@ -1091,10 +1231,12 @@ // CHECK-LABEL: @test_vmull_high_lane_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> 
[[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { @@ -1104,10 +1246,12 @@ // CHECK-LABEL: @test_vmull_high_lane_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> 
[[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { @@ -1116,10 +1260,12 @@ // CHECK-LABEL: @test_vmull_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { @@ -1128,10 +1274,12 @@ // CHECK-LABEL: @test_vmull_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 
x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { @@ -1140,10 +1288,12 @@ // CHECK-LABEL: @test_vmull_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -1152,10 +1302,12 @@ // CHECK-LABEL: @test_vmull_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -1165,10 +1317,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { @@ -1178,10 +1332,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: 
[[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { @@ -1191,10 +1347,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { @@ -1204,10 +1362,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_u32( // 
CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { @@ -1216,11 +1376,13 @@ // CHECK-LABEL: @test_vqdmlal_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> 
[[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -1230,11 +1392,13 @@ // CHECK-LABEL: @test_vqdmlal_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -1245,11 +1409,13 @@ // CHECK-LABEL: @test_vqdmlal_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
<4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -1260,11 +1426,13 @@ // CHECK-LABEL: @test_vqdmlal_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 
x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -1274,11 +1442,13 @@ // CHECK-LABEL: @test_vqdmlsl_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -1288,11 +1458,13 @@ // CHECK-LABEL: @test_vqdmlsl_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> 
[[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -1303,11 +1475,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -1318,11 +1492,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -1332,10 +1508,12 @@ // CHECK-LABEL: @test_vqdmull_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x 
i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1345,10 +1523,12 @@ // CHECK-LABEL: @test_vqdmull_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x 
i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1358,10 +1538,12 @@ // CHECK-LABEL: @test_vqdmull_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1371,10 +1553,12 @@ // CHECK-LABEL: @test_vqdmull_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// 
CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1385,10 +1569,12 @@ // CHECK-LABEL: @test_vqdmull_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1399,10 +1585,12 @@ // CHECK-LABEL: @test_vqdmull_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1413,10 +1601,12 @@ // CHECK-LABEL: @test_vqdmull_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1427,10 +1617,12 @@ // CHECK-LABEL: 
@test_vqdmull_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1544,8 +1736,10 @@ // CHECK-LABEL: @test_vmul_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { @@ -1571,8 +1765,10 @@ // CHECK-LABEL: @test_vmulq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> 
[[V:%.*]], <2 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { @@ -1581,8 +1777,10 @@ // CHECK-LABEL: @test_vmulq_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { @@ -1591,8 +1789,10 @@ // CHECK-LABEL: @test_vmul_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { @@ -1617,8 +1817,10 @@ // CHECK-LABEL: @test_vmulq_laneq_f32( // CHECK-NEXT: entry: -// 
CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { @@ -1627,8 +1829,10 @@ // CHECK-LABEL: @test_vmulq_laneq_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { @@ -1637,10 +1841,12 @@ // CHECK-LABEL: @test_vmulx_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { @@ -1649,10 +1855,12 @@ // CHECK-LABEL: @test_vmulxq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { @@ -1661,10 +1869,12 @@ // CHECK-LABEL: @test_vmulxq_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x 
double> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { @@ -1673,10 +1883,12 @@ // CHECK-LABEL: @test_vmulx_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { @@ -1685,10 +1897,12 @@ // CHECK-LABEL: @test_vmulxq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { @@ -1697,10 +1911,12 @@ // CHECK-LABEL: @test_vmulxq_laneq_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { @@ -1709,8 +1925,10 @@ // CHECK-LABEL: @test_vmla_lane_s16_0( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -1720,8 +1938,10 @@ // CHECK-LABEL: @test_vmlaq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -1731,8 +1951,10 @@ // CHECK-LABEL: @test_vmla_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -1742,8 +1964,10 
@@ // CHECK-LABEL: @test_vmlaq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -1753,8 +1977,10 @@ // CHECK-LABEL: @test_vmla_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -1764,8 +1990,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // 
CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -1775,8 +2003,10 @@ // CHECK-LABEL: @test_vmla_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -1786,8 +2016,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -1797,8 +2029,10 @@ // CHECK-LABEL: @test_vmls_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: 
[[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -1808,8 +2042,10 @@ // CHECK-LABEL: @test_vmlsq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -1819,8 +2055,10 @@ // CHECK-LABEL: @test_vmls_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -1830,8 +2068,10 @@ // CHECK-LABEL: @test_vmlsq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul 
<4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1841,8 +2081,10 @@ // CHECK-LABEL: @test_vmls_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -1852,8 +2094,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -1863,8 +2107,10 @@ // CHECK-LABEL: @test_vmls_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x 
i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -1874,8 +2120,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1885,8 +2133,10 @@ // CHECK-LABEL: @test_vmul_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -1895,8 +2145,10 @@ // CHECK-LABEL: @test_vmulq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector 
<4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -1905,8 +2157,10 @@ // CHECK-LABEL: @test_vmul_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -1915,8 +2169,10 @@ // CHECK-LABEL: @test_vmulq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -1925,8 +2181,10 @@ // CHECK-LABEL: @test_vmul_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) { @@ -1935,8 +2193,10 @@ // CHECK-LABEL: @test_vmulq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) { @@ -1945,8 +2205,10 @@ // CHECK-LABEL: @test_vmul_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) { @@ -1955,8 +2217,10 @@ // CHECK-LABEL: @test_vmulq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) { @@ -1965,8 +2229,10 @@ // CHECK-LABEL: @test_vmul_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -1975,8 +2241,10 @@ // CHECK-LABEL: @test_vmulq_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) { @@ -1985,8 +2253,10 @@ // CHECK-LABEL: @test_vmul_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -1995,8 +2265,10 @@ // CHECK-LABEL: @test_vmulq_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) { @@ -2005,8 +2277,10 @@ // CHECK-LABEL: @test_vmul_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) { @@ -2015,8 +2289,10 @@ // CHECK-LABEL: @test_vmulq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> 
[[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) { @@ -2025,8 +2301,10 @@ // CHECK-LABEL: @test_vmul_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) { @@ -2035,8 +2313,10 @@ // CHECK-LABEL: @test_vmulq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) { @@ -2210,10 +2490,12 @@ // CHECK-LABEL: @test_vmlal_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> 
zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2223,10 +2505,12 @@ // CHECK-LABEL: @test_vmlal_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ 
-2236,10 +2520,12 @@ // CHECK-LABEL: @test_vmlal_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2249,10 +2535,12 @@ // CHECK-LABEL: @test_vmlal_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2263,10 +2551,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2277,10 +2567,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] 
= bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2291,10 +2583,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2305,10 +2599,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], 
<2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2318,10 +2614,12 @@ // CHECK-LABEL: @test_vmlsl_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x 
i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2331,10 +2629,12 @@ // CHECK-LABEL: @test_vmlsl_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2344,10 +2644,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x 
i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2357,10 +2659,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2371,10 +2675,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2385,10 +2691,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2399,10 +2707,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x 
i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2413,10 +2723,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast 
<2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2426,10 +2738,12 @@ // CHECK-LABEL: @test_vmlal_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2439,10 +2753,12 @@ // CHECK-LABEL: @test_vmlal_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2452,10 +2768,12 @@ // CHECK-LABEL: @test_vmlal_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2465,10 +2783,12 @@ // CHECK-LABEL: @test_vmlal_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> 
[[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2479,10 +2799,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2493,10 +2815,12 @@ // CHECK-LABEL: @test_vmlal_high_lane_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x 
i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2507,10 +2831,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2521,10 +2847,12 @@ // CHECK-LABEL: @test_vmlal_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2534,10 +2862,12 @@ // CHECK-LABEL: @test_vmlsl_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2547,10 +2877,12 @@ // CHECK-LABEL: @test_vmlsl_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2560,10 +2892,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2573,10 +2907,12 @@ // CHECK-LABEL: @test_vmlsl_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2587,10 +2923,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: 
[[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2601,10 +2939,12 @@ // CHECK-LABEL: @test_vmlsl_high_lane_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> 
[[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2615,10 +2955,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2629,10 +2971,12 @@ // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> 
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2642,10 +2986,12 @@ // CHECK-LABEL: @test_vmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -2654,10 +3000,12 @@ // CHECK-LABEL: @test_vmull_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> 
[[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -2666,10 +3014,12 @@ // CHECK-LABEL: @test_vmull_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t 
test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { @@ -2678,10 +3028,12 @@ // CHECK-LABEL: @test_vmull_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { @@ -2691,10 +3043,12 @@ // CHECK-LABEL: @test_vmull_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -2704,10 +3058,12 @@ // CHECK-LABEL: @test_vmull_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -2717,10 +3073,12 @@ // CHECK-LABEL: @test_vmull_high_lane_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
<4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { @@ -2730,10 +3088,12 @@ // CHECK-LABEL: @test_vmull_high_lane_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t 
test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { @@ -2742,10 +3102,12 @@ // CHECK-LABEL: @test_vmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -2754,10 +3116,12 @@ // CHECK-LABEL: @test_vmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to 
<8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -2766,10 +3130,12 @@ // CHECK-LABEL: @test_vmull_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { @@ -2778,10 +3144,12 @@ // CHECK-LABEL: @test_vmull_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> 
[[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { @@ -2791,10 +3159,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { @@ -2804,10 +3174,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> 
[[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { @@ -2817,10 +3189,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret 
<4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { @@ -2830,10 +3204,12 @@ // CHECK-LABEL: @test_vmull_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { @@ -2842,11 +3218,13 @@ // CHECK-LABEL: @test_vqdmlal_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x 
i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -2856,11 +3234,13 @@ // CHECK-LABEL: @test_vqdmlal_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -2871,11 +3251,13 @@ // CHECK-LABEL: 
@test_vqdmlal_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -2886,11 +3268,13 @@ // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -2900,11 +3284,13 @@ // CHECK-LABEL: @test_vqdmlsl_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 
// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -2914,11 +3300,13 @@ // CHECK-LABEL: @test_vqdmlsl_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -2929,11 +3317,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> 
[[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -2944,11 +3334,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> 
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -2958,10 +3350,12 @@ // CHECK-LABEL: @test_vqdmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -2971,10 +3365,12 @@ // CHECK-LABEL: @test_vqdmull_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -2984,10 +3380,12 @@ // CHECK-LABEL: @test_vqdmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -2997,10 +3395,12 @@ // CHECK-LABEL: @test_vqdmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3011,10 +3411,12 @@ // CHECK-LABEL: @test_vqdmull_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> 
[[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3025,10 +3427,12 @@ // CHECK-LABEL: @test_vqdmull_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3039,10 +3443,12 @@ // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x 
i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3053,10 +3459,12 @@ // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3170,8 +3578,10 @@ // CHECK-LABEL: 
@test_vmul_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { @@ -3180,8 +3590,10 @@ // CHECK-LABEL: @test_vmulq_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { @@ -3190,8 +3602,10 @@ // CHECK-LABEL: @test_vmul_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x 
float> [[MUL]] // float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { @@ -3215,8 +3629,10 @@ // CHECK-LABEL: @test_vmulq_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { @@ -3225,8 +3641,10 @@ // CHECK-LABEL: @test_vmulq_laneq_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { @@ -3235,10 +3653,12 @@ // CHECK-LABEL: @test_vmulx_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { @@ -3247,10 +3667,12 @@ // CHECK-LABEL: @test_vmulxq_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { @@ -3259,10 +3681,12 @@ // CHECK-LABEL: @test_vmulxq_lane_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { @@ -3271,10 +3695,12 @@ // CHECK-LABEL: @test_vmulx_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // 
float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { @@ -3283,10 +3709,12 @@ // CHECK-LABEL: @test_vmulxq_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { @@ -3295,10 +3723,12 @@ // CHECK-LABEL: @test_vmulxq_laneq_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast 
<2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { @@ -4461,8 +4891,10 @@ // CHECK-LABEL: @test_vmla_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4472,8 +4904,10 @@ // CHECK-LABEL: @test_vmlaq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4483,8 +4917,10 @@ // CHECK-LABEL: @test_vmla_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4494,8 +4930,10 @@ // CHECK-LABEL: @test_vmlaq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4505,8 +4943,10 @@ // CHECK-LABEL: @test_vmla_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4516,8 +4956,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul 
<8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4527,8 +4969,10 @@ // CHECK-LABEL: @test_vmla_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4538,8 +4982,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4549,11 +4995,13 @@ // CHECK-LABEL: @test_vqdmlal_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 
x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4563,11 +5011,13 @@ // CHECK-LABEL: @test_vqdmlal_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to 
<16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4578,11 +5028,13 @@ // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4593,11 +5045,13 @@ // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0( // CHECK-NEXT: entry: // 
CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4607,8 +5061,10 @@ // CHECK-LABEL: @test_vmls_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // 
CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -4618,8 +5074,10 @@ // CHECK-LABEL: @test_vmlsq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -4629,8 +5087,10 @@ // CHECK-LABEL: @test_vmls_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -4640,8 +5100,10 @@ // CHECK-LABEL: @test_vmlsq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: 
[[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -4651,8 +5113,10 @@ // CHECK-LABEL: @test_vmls_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -4662,8 +5126,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -4673,8 +5139,10 @@ // CHECK-LABEL: @test_vmls_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: 
[[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -4684,8 +5152,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -4695,11 +5165,13 @@ // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: 
[[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -4709,11 +5181,13 @@ // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -4724,11 +5198,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x 
i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -4739,11 +5215,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -4857,8 +5335,10 @@ // CHECK-LABEL: @test_vmla_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4868,8 +5348,10 @@ // CHECK-LABEL: @test_vmlaq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4879,8 +5361,10 @@ // CHECK-LABEL: @test_vmla_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4890,8 +5374,10 @@ // CHECK-LABEL: @test_vmlaq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4901,8 +5387,10 @@ // CHECK-LABEL: @test_vmla_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4912,8 +5400,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x 
i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4923,8 +5413,10 @@ // CHECK-LABEL: @test_vmla_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4934,8 +5426,10 @@ // CHECK-LABEL: @test_vmlaq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4945,11 +5439,13 @@ // CHECK-LABEL: @test_vqdmlal_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: 
[[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4959,11 +5455,13 @@ // CHECK-LABEL: @test_vqdmlal_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: 
[[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4974,11 +5472,13 @@ // CHECK-LABEL: @test_vqdmlal_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4989,11 +5489,13 @@ // CHECK-LABEL: @test_vqdmlal_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x 
i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -5003,8 +5505,10 @@ // CHECK-LABEL: @test_vmls_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -5014,8 +5518,10 @@ // CHECK-LABEL: @test_vmlsq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -5025,8 +5531,10 @@ // CHECK-LABEL: @test_vmls_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -5036,8 +5544,10 @@ // CHECK-LABEL: @test_vmlsq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -5047,8 +5557,10 @@ // CHECK-LABEL: @test_vmls_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = 
shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -5058,8 +5570,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -5069,8 +5583,10 @@ // CHECK-LABEL: @test_vmls_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -5080,8 +5596,10 @@ // CHECK-LABEL: @test_vmlsq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> 
[[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -5091,11 +5609,13 @@ // CHECK-LABEL: @test_vqdmlsl_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -5105,11 +5625,13 @@ // CHECK-LABEL: @test_vqdmlsl_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -5120,11 +5642,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: 
[[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -5135,11 +5659,13 @@ // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // diff --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c --- a/clang/test/CodeGen/aarch64-neon-fma.c +++ b/clang/test/CodeGen/aarch64-neon-fma.c @@ -69,144 +69,177 @@ } // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer -// CHECK: 
[[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { return vmla_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlaq_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast 
<16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vmla_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub 
<2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { return vmls_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlsq_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vmls_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> 
@test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlsq_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vmla_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] 
-// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlaq_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vmla_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> 
[[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vmls_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] +// float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlsq_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x 
float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vmls_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlsq_laneq_f32(a, b, v, 3); } diff --git a/clang/test/CodeGen/aarch64-poly64.c b/clang/test/CodeGen/aarch64-poly64.c --- a/clang/test/CodeGen/aarch64-poly64.c +++ b/clang/test/CodeGen/aarch64-poly64.c @@ -150,22 +150,28 @@ } // CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer -// CHECK: 
ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] poly64x1_t test_vdup_lane_p64(poly64x1_t vec) { return vdup_lane_p64(vec, 0); } // CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) { return vdupq_lane_p64(vec, 0); } // CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> +// CHECK: ret <2 x i64> [[LANE]] poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) { return vdupq_laneq_p64(vec, 1); } diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c --- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c @@ -1086,32 +1086,40 @@ } // CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: 
[[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulq_lane_f16(a, b, 7); + return vmulq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmul_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) { return vmul_laneq_f16(a, b, 7); } // CHECK-LABEL: test_vmulq_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], 
[[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) { return vmulq_laneq_f16(a, b, 7); @@ -1165,33 +1173,49 @@ } // CHECK-LABEL: test_vmulx_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]]) -// CHECK: ret <4 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4 +// CHECK: ret <4 x half> [[VMULX2_I]] float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) { return vmulx_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulxq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]]) -// CHECK: ret <8 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4 +// CHECK: ret <8 x half> [[VMULX2_I]] float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulxq_lane_f16(a, b, 7); + return vmulxq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulx_laneq_f16 -// CHECK: 
[[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]]) -// CHECK: ret <4 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4 +// CHECK: ret <4 x half> [[VMULX2_I]] float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) { return vmulx_laneq_f16(a, b, 7); } // CHECK-LABEL: test_vmulxq_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]]) -// CHECK: ret <8 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4 +// CHECK: ret <8 x half> [[VMULX2_I]] float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) { return vmulxq_laneq_f16(a, b, 7); } @@ -1473,17 +1497,21 @@ } // CHECK-LABEL: test_vdup_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> -// CHECK: ret <4 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> 
[[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: ret <4 x half> [[LANE]] float16x4_t test_vdup_lane_f16(float16x4_t a) { return vdup_lane_f16(a, 3); } // CHECK-LABEL: test_vdupq_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> -// CHECK: ret <8 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK: ret <8 x half> [[LANE]] float16x8_t test_vdupq_lane_f16(float16x4_t a) { - return vdupq_lane_f16(a, 7); + return vdupq_lane_f16(a, 3); } // CHECK-LABEL: @test_vext_f16( diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c --- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c @@ -773,19 +773,23 @@ } // CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x 
i32> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulq_lane_f16(a, b, 7); + return vmulq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmul_n_f16 @@ -939,17 +943,21 @@ } // CHECK-LABEL: test_vdup_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> -// CHECK: ret <4 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: ret <4 x half> [[LANE]] float16x4_t test_vdup_lane_f16(float16x4_t a) { return vdup_lane_f16(a, 3); } // CHECK-LABEL: test_vdupq_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> -// CHECK: ret <8 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK: ret <8 x half> [[LANE]] float16x8_t test_vdupq_lane_f16(float16x4_t a) { - return vdupq_lane_f16(a, 7); + return vdupq_lane_f16(a, 3); } // CHECK-LABEL: @test_vext_f16( diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c --- a/clang/test/CodeGen/arm64_vdupq_n_f64.c +++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c @@ -28,7 +28,9 @@ // this was in , but had already been implemented, // test anyway // CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> 
[[TMP1]], <2 x i32> zeroinitializer // CHECK: ret <2 x double> [[SHUFFLE]] float64x2_t test_vdupq_lane_f64(float64x1_t V) { return vdupq_lane_f64(V, 0); diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c --- a/clang/test/CodeGen/arm_neon_intrinsics.c +++ b/clang/test/CodeGen/arm_neon_intrinsics.c @@ -2419,15 +2419,19 @@ } // CHECK-LABEL: @test_vdup_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i16> [[LANE]] uint16x4_t test_vdup_lane_u16(uint16x4_t a) { return vdup_lane_u16(a, 3); } // CHECK-LABEL: @test_vdup_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: ret <2 x i32> [[LANE]] uint32x2_t test_vdup_lane_u32(uint32x2_t a) { return vdup_lane_u32(a, 1); } @@ -2440,15 +2444,19 @@ } // CHECK-LABEL: @test_vdup_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i16> [[LANE]] int16x4_t test_vdup_lane_s16(int16x4_t a) { return vdup_lane_s16(a, 3); } // CHECK-LABEL: @test_vdup_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> -// CHECK: ret <2 x i32> 
[[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: ret <2 x i32> [[LANE]] int32x2_t test_vdup_lane_s32(int32x2_t a) { return vdup_lane_s32(a, 1); } @@ -2461,15 +2469,19 @@ } // CHECK-LABEL: @test_vdup_lane_p16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i16> [[LANE]] poly16x4_t test_vdup_lane_p16(poly16x4_t a) { return vdup_lane_p16(a, 3); } // CHECK-LABEL: @test_vdup_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: ret <2 x float> [[LANE]] float32x2_t test_vdup_lane_f32(float32x2_t a) { return vdup_lane_f32(a, 1); } @@ -2482,15 +2494,19 @@ } // CHECK-LABEL: @test_vdupq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[LANE]] uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> -// 
CHECK: ret <4 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[LANE]] uint32x4_t test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); } @@ -2503,15 +2519,19 @@ } // CHECK-LABEL: @test_vdupq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[LANE]] int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: ret <4 x i32> [[LANE]] int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); } @@ -2524,43 +2544,55 @@ } // CHECK-LABEL: @test_vdupq_lane_p16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: ret <8 x i16> [[LANE]] poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x 
i32> -// CHECK: ret <4 x float> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: ret <4 x float> [[LANE]] float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } // CHECK-LABEL: @test_vdup_lane_s64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } // CHECK-LABEL: @test_vdup_lane_u64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } // CHECK-LABEL: @test_vdupq_lane_s64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } // CHECK-LABEL: @test_vdupq_lane_u64( -// CHECK: 
[[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } @@ -7077,44 +7109,52 @@ } // CHECK-LABEL: @test_vmlal_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlal_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// 
CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlal_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlal_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// 
CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_lane_u32(a, b, c, 1); @@ -7173,90 +7213,110 @@ } // CHECK-LABEL: @test_vmla_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmla_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x 
i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmla_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmla_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmla_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] // CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_lane_f32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlaq_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlaq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlaq_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: 
[[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlaq_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlaq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlaq_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] // CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlaq_lane_f32(a, b, c, 1); @@ -7553,44 +7613,52 @@ } // 
CHECK-LABEL: @test_vmlsl_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsl_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <2 x i64> 
[[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsl_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsl_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] 
to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_lane_u32(a, b, c, 1); @@ -7649,90 +7717,110 @@ } // CHECK-LABEL: @test_vmls_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmls_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[SUB]] int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmls_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmls_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmls_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] // CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_lane_f32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> 
%c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlsq_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlsq_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return 
vmlsq_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlsq_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] // CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlsq_lane_f32(a, b, c, 1); @@ -8404,40 +8492,48 @@ } // CHECK-LABEL: @test_vmull_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: 
[[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) { return vmull_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmull_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) { return vmull_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmull_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: 
[[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) { return vmull_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmull_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) { return vmull_lane_u32(a, b, 1); @@ -8506,80 +8602,100 @@ } // CHECK-LABEL: @test_vmul_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) { return vmul_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmul_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector 
<2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmul_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); } // CHECK-LABEL: @test_vmul_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmul_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: 
[[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmulq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return 
vmulq_lane_f32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmulq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); @@ -9700,24 +9816,28 @@ } // CHECK-LABEL: @test_vqdmlal_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast 
<4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vqdmlal_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_lane_s32(a, b, c, 1); @@ -9774,24 +9894,28 @@ } // CHECK-LABEL: @test_vqdmlsl_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x 
i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vqdmlsl_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> 
+// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_lane_s32(a, b, c, 1); @@ -9866,10 +9990,12 @@ } // CHECK-LABEL: @test_vqdmulh_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQDMULH_V2_I]] int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) { @@ -9877,10 +10003,12 @@ } // CHECK-LABEL: @test_vqdmulh_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> 
[[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQDMULH_V2_I]] int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) { @@ -9888,10 +10016,12 @@ } // CHECK-LABEL: @test_vqdmulhq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]] int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) { @@ -9899,10 +10029,12 @@ } // CHECK-LABEL: @test_vqdmulhq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// 
CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]] int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) { @@ -9988,10 +10120,12 @@ } // CHECK-LABEL: @test_vqdmull_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQDMULL_V2_I]] int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) { @@ -9999,10 +10133,12 @@ } // CHECK-LABEL: @test_vqdmull_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x 
i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VQDMULL_V2_I]] int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) { @@ -10204,10 +10340,12 @@ } // CHECK-LABEL: @test_vqrdmulh_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQRDMULH_V2_I]] int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { @@ -10215,10 +10353,12 @@ } // CHECK-LABEL: @test_vqrdmulh_lane_s32( -// CHECK: 
[[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQRDMULH_V2_I]] int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { @@ -10226,10 +10366,12 @@ } // CHECK-LABEL: @test_vqrdmulhq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]] int16x8_t 
test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { @@ -10237,10 +10379,12 @@ } // CHECK-LABEL: @test_vqrdmulhq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]] int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) { diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" @@ -238,6 +239,11 @@ NumVectors = 1; } + void make32BitElement() { + assert_with_loc(Bitwidth > 32, "Not enough bits to make it 32!"); + ElementBitwidth = 32; + } + void doubleLanes() { assert_with_loc(Bitwidth != 128, "Can't get bigger than 128!"); Bitwidth = 128; @@ -518,7 +524,7 @@ std::pair emitDagDupTyped(DagInit *DI); std::pair emitDagShuffle(DagInit *DI); std::pair emitDagCast(DagInit *DI, bool IsBitCast); - std::pair emitDagCall(DagInit *DI); + std::pair 
emitDagCall(DagInit *DI, bool MatchMangledName); std::pair emitDagNameReplace(DagInit *DI); std::pair emitDagLiteral(DagInit *DI); std::pair emitDagOp(DagInit *DI); @@ -546,7 +552,8 @@ public: /// Called by Intrinsic - this attempts to get an intrinsic that takes /// the given types as arguments. - Intrinsic &getIntrinsic(StringRef Name, ArrayRef Types); + Intrinsic &getIntrinsic(StringRef Name, ArrayRef Types, + Optional MangledName); /// Called by Intrinsic - returns a globally-unique number. unsigned getUniqueNumber() { return UniqueNumber++; } @@ -1383,8 +1390,8 @@ return emitDagSaveTemp(DI); if (Op == "op") return emitDagOp(DI); - if (Op == "call") - return emitDagCall(DI); + if (Op == "call" || Op == "call_mangled") + return emitDagCall(DI, Op == "call_mangled"); if (Op == "name_replace") return emitDagNameReplace(DI); if (Op == "literal") @@ -1411,7 +1418,8 @@ } } -std::pair Intrinsic::DagEmitter::emitDagCall(DagInit *DI) { +std::pair Intrinsic::DagEmitter::emitDagCall(DagInit *DI, + bool MatchMangledName) { std::vector Types; std::vector Values; for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) { @@ -1427,7 +1435,12 @@ N = SI->getAsUnquotedString(); else N = emitDagArg(DI->getArg(0), "").second; - Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types); + Optional MangledName; + if (MatchMangledName) { + if (Intr.getRecord()->getValueAsBit("isLaneQ")) N += "q"; + MangledName = Intr.mangleName(N, ClassS); + } + Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types, MangledName); // Make sure the callee is known as an early def. 
Callee.setNeededEarly(); @@ -1486,6 +1499,8 @@ castToType.doubleLanes(); } else if (SI->getAsUnquotedString() == "8") { castToType.makeInteger(8, true); + } else if (SI->getAsUnquotedString() == "32") { + castToType.make32BitElement(); } else { castToType = Type::fromTypedefName(SI->getAsUnquotedString()); assert_with_loc(!castToType.isVoid(), "Unknown typedef"); @@ -1832,7 +1847,8 @@ // NeonEmitter implementation //===----------------------------------------------------------------------===// -Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types) { +Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types, + Optional MangledName) { // First, look up the name in the intrinsic map. assert_with_loc(IntrinsicMap.find(Name.str()) != IntrinsicMap.end(), ("Intrinsic '" + Name + "' not found!").str()); @@ -1871,6 +1887,9 @@ break; } } + if (MangledName) + Good &= I.getMangledName(true) == MangledName; + if (Good) GoodVec.push_back(&I); }