Index: include/clang/Basic/arm_neon.td =================================================================== --- include/clang/Basic/arm_neon.td +++ include/clang/Basic/arm_neon.td @@ -40,16 +40,45 @@ def OP_MLAL_N : Op; def OP_MLSL_N : Op; def OP_MUL_LN: Op; +def OP_MUL_LNQ: Op; +def OP_MULX_LN: Op; +def OP_MULX_LNQ: Op; def OP_MULL_LN : Op; +def OP_MULLHi_LN : Op; +def OP_MULL_LNQ : Op; +def OP_MULLHi_LNQ : Op; def OP_MLA_LN: Op; +def OP_MLA_LNQ: Op; def OP_MLS_LN: Op; +def OP_MLS_LNQ: Op; def OP_MLAL_LN : Op; +def OP_MLAL_LNQ : Op; +def OP_MLALHi_LN : Op; +def OP_MLALHi_LNQ : Op; def OP_MLSL_LN : Op; +def OP_MLSL_LNQ : Op; +def OP_MLSLHi_LN : Op; +def OP_MLSLHi_LNQ : Op; def OP_QDMULL_LN : Op; +def OP_QDMULLHi_LN : Op; +def OP_QDMULL_LNQ : Op; +def OP_QDMULLHi_LNQ : Op; def OP_QDMLAL_LN : Op; +def OP_QDMLAL_LNQ: Op; +def OP_QDMLALHi_LN : Op; +def OP_QDMLALHi_LNQ : Op; def OP_QDMLSL_LN : Op; +def OP_QDMLSL_LNQ: Op; +def OP_QDMLSLHi_LN : Op; +def OP_QDMLSLHi_LNQ : Op; def OP_QDMULH_LN : Op; +def OP_QDMULH_LNQ : Op; def OP_QRDMULH_LN : Op; +def OP_QRDMULH_LNQ : Op; +def OP_FMA_LN : Op; +def OP_FMA_LNQ : Op; +def OP_FMS_LN : Op; +def OP_FMS_LNQ : Op; def OP_EQ : Op; def OP_GE : Op; def OP_LE : Op; @@ -146,6 +175,7 @@ // f: float (int args) // d: default // g: default, ignore 'Q' size modifier. +// j: default, force 'Q' size modifier. 
// w: double width elements, same num elts // n: double width elements, half num elts // h: half width elements, double num elts @@ -503,7 +533,7 @@ //////////////////////////////////////////////////////////////////////////////// // Multiplication Extended -def MULX : SInst<"vmulx", "ddd", "fQfQd">; +def MULX : SInst<"vmulx", "ddd", "fdQfQd">; //////////////////////////////////////////////////////////////////////////////// // Division @@ -630,6 +660,63 @@ def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>; //////////////////////////////////////////////////////////////////////////////// + +def VMLA_LANEQ : IOpInst<"vmla_laneq", "dddji", + "siUsUifQsQiQUsQUiQf", OP_MLA_LNQ>; +def VMLS_LANEQ : IOpInst<"vmls_laneq", "dddji", + "siUsUifQsQiQUsQUiQf", OP_MLS_LNQ>; + +def VFMA_LANE : IOpInst<"vfma_lane", "dddgi", "fdQfQd", OP_FMA_LN>; +def VFMA_LANEQ : IOpInst<"vfma_laneq", "dddji", "fdQfQd", OP_FMA_LNQ>; +def VFMS_LANE : IOpInst<"vfms_lane", "dddgi", "fdQfQd", OP_FMS_LN>; +def VFMS_LANEQ : IOpInst<"vfms_laneq", "dddji", "fdQfQd", OP_FMS_LNQ>; + +def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "wwdki", "siUsUi", OP_MLAL_LNQ>; +def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "wwkdi", "siUsUi", + OP_MLALHi_LN>; +def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "wwkki", "siUsUi", + OP_MLALHi_LNQ>; +def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "wwdki", "siUsUi", OP_MLSL_LNQ>; +def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "wwkdi", "siUsUi", + OP_MLSLHi_LN>; +def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "wwkki", "siUsUi", + OP_MLSLHi_LNQ>; + +def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "wwdki", "si", OP_QDMLAL_LNQ>; +def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "wwkdi", "si", + OP_QDMLALHi_LN>; +def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "wwkki", "si", + OP_QDMLALHi_LNQ>; +def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "wwdki", "si", OP_QDMLSL_LNQ>; +def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "wwkdi", "si", + OP_QDMLSLHi_LN>; 
+def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "wwkki", "si", + OP_QDMLSLHi_LNQ>; + +// AArch64 only: add float64 ('d') variants of vmul_lane +def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "dQd", OP_MUL_LN>; + +def VMUL_LANEQ : IOpInst<"vmul_laneq", "ddji", + "sifdUsUiQsQiQfQUsQUiQfQd", OP_MUL_LNQ>; +def VMULL_LANEQ : SOpInst<"vmull_laneq", "wdki", "siUsUi", OP_MULL_LNQ>; +def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "wkdi", "siUsUi", + OP_MULLHi_LN>; +def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "wkki", "siUsUi", + OP_MULLHi_LNQ>; + +def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "wdki", "si", OP_QDMULL_LNQ>; +def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "wkdi", "si", + OP_QDMULLHi_LN>; +def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "wkki", "si", + OP_QDMULLHi_LNQ>; + +def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ddji", "siQsQi", OP_QDMULH_LNQ>; +def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ddji", "siQsQi", OP_QRDMULH_LNQ>; + +def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fdQfQd", OP_MULX_LN>; +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fdQfQd", OP_MULX_LNQ>; + +//////////////////////////////////////////////////////////////////////////////// // Scalar Arithmetic // Scalar Addition Index: test/CodeGen/aarch64-neon-2velem.c =================================================================== --- /dev/null +++ test/CodeGen/aarch64-neon-2velem.c @@ -0,0 +1,802 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \ +// RUN: -S -O3 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \ +// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s + +// Test new aarch64 intrinsics and types + +#include <arm_neon.h> + +int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmla_lane_s16 + return vmla_lane_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t 
test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlaq_lane_s16 + return vmlaq_lane_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmla_lane_s32 + return vmla_lane_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlaq_lane_s32 + return vmlaq_lane_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmla_laneq_s16 + return vmla_laneq_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlaq_laneq_s16 + return vmlaq_laneq_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmla_laneq_s32 + return vmla_laneq_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlaq_laneq_s32 + return vmlaq_laneq_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmls_lane_s16 + return vmls_lane_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsq_lane_s16 + return vmlsq_lane_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmls_lane_s32 + return 
vmls_lane_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsq_lane_s32 + return vmlsq_lane_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmls_laneq_s16 + return vmls_laneq_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsq_laneq_s16 + return vmlsq_laneq_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmls_laneq_s32 + return vmls_laneq_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsq_laneq_s32 + return vmlsq_laneq_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vmul_lane_s16 + return vmul_lane_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vmulq_lane_s16 + return vmulq_lane_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vmul_lane_s32 + return vmul_lane_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vmulq_lane_s32 + return vmulq_lane_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { + // 
CHECK: test_vmul_lane_u16 + return vmul_lane_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { + // CHECK: test_vmulq_lane_u16 + return vmulq_lane_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { + // CHECK: test_vmul_lane_u32 + return vmul_lane_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { + // CHECK: test_vmulq_lane_u32 + return vmulq_lane_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vmul_laneq_s16 + return vmul_laneq_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vmulq_laneq_s16 + return vmulq_laneq_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vmul_laneq_s32 + return vmul_laneq_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vmulq_laneq_s32 + return vmulq_laneq_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { + // CHECK: test_vmul_laneq_u16 + return vmul_laneq_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { + // CHECK: test_vmulq_laneq_u16 + return vmulq_laneq_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { + // CHECK: test_vmul_laneq_u32 
+ return vmul_laneq_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) { + // CHECK: test_vmulq_laneq_u32 + return vmulq_laneq_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { + // CHECK: test_vfma_lane_f32 + return vfma_lane_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { + // CHECK: test_vfmaq_lane_f32 + return vfmaq_lane_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { + // CHECK: test_vfma_laneq_f32 + return vfma_laneq_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { + // CHECK: test_vfmaq_laneq_f32 + return vfmaq_laneq_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { + // CHECK: test_vfms_lane_f32 + return vfms_lane_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { + // CHECK: test_vfmsq_lane_f32 + return vfmsq_lane_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { + // CHECK: test_vfms_laneq_f32 + return vfms_laneq_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { + // CHECK: test_vfmsq_laneq_f32 + return vfmsq_laneq_f32(a, b, v, 1); + // 
CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { + // CHECK: test_vfmaq_lane_f64 + return vfmaq_lane_f64(a, b, v, 0); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmaq_laneq_f64 + return vfmaq_laneq_f64(a, b, v, 0); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmaq_laneq_f64 + return vfmaq_laneq_f64(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { + // CHECK: test_vfmsq_lane_f64 + return vfmsq_lane_f64(a, b, v, 0); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmsq_laneq_f64 + return vfmsq_laneq_f64(a, b, v, 0); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmsq_laneq_f64 + return vfmsq_laneq_f64(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlal_lane_s16 + return vmlal_lane_s16(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlal_lane_s32 + return vmlal_lane_s32(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlal_laneq_s16 + return vmlal_laneq_s16(a, b, v, 1); + // CHECK: smlal 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlal_laneq_s32 + return vmlal_laneq_s32(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlal_high_lane_s16 + return vmlal_high_lane_s16(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlal_high_lane_s32 + return vmlal_high_lane_s32(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlal_high_laneq_s16 + return vmlal_high_laneq_s16(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlal_high_laneq_s32 + return vmlal_high_laneq_s32(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlsl_lane_s16 + return vmlsl_lane_s16(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlsl_lane_s32 + return vmlsl_lane_s32(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlsl_laneq_s16 + return vmlsl_laneq_s16(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlsl_laneq_s32 + return vmlsl_laneq_s32(a, b, v, 1); + // CHECK: smlsl 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsl_high_lane_s16 + return vmlsl_high_lane_s16(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsl_high_lane_s32 + return vmlsl_high_lane_s32(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsl_high_laneq_s16 + return vmlsl_high_laneq_s16(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsl_high_laneq_s32 + return vmlsl_high_laneq_s32(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlal_lane_u16 + return vmlal_lane_u16(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlal_lane_u32 + return vmlal_lane_u32(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlal_laneq_u16 + return vmlal_laneq_u16(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlal_laneq_u32 + return vmlal_laneq_u32(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlal_high_lane_u16 + return vmlal_high_lane_u16(a, b, v, 1); + // 
CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlal_high_lane_u32 + return vmlal_high_lane_u32(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlal_high_laneq_u16 + return vmlal_high_laneq_u16(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlal_high_laneq_u32 + return vmlal_high_laneq_u32(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlsl_lane_u16 + return vmlsl_lane_u16(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlsl_lane_u32 + return vmlsl_lane_u32(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlsl_laneq_u16 + return vmlsl_laneq_u16(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlsl_laneq_u32 + return vmlsl_laneq_u32(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsl_high_lane_u16 + return vmlsl_high_lane_u16(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsl_high_lane_u32 + return vmlsl_high_lane_u32(a, b, v, 
1); + // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsl_high_laneq_u16 + return vmlsl_high_laneq_u16(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsl_high_laneq_u32 + return vmlsl_high_laneq_u32(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vmull_lane_s16 + return vmull_lane_s16(a, v, 1); + // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vmull_lane_s32 + return vmull_lane_s32(a, v, 1); + // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { + // CHECK: test_vmull_lane_u16 + return vmull_lane_u16(a, v, 1); + // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { + // CHECK: test_vmull_lane_u32 + return vmull_lane_u32(a, v, 1); + // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vmull_high_lane_s16 + return vmull_high_lane_s16(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vmull_high_lane_s32 + return vmull_high_lane_s32(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { + // CHECK: test_vmull_high_lane_u16 + return vmull_high_lane_u16(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint64x2_t 
test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { + // CHECK: test_vmull_high_lane_u32 + return vmull_high_lane_u32(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vmull_laneq_s16 + return vmull_laneq_s16(a, v, 1); + // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vmull_laneq_s32 + return vmull_laneq_s32(a, v, 1); + // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { + // CHECK: test_vmull_laneq_u16 + return vmull_laneq_u16(a, v, 1); + // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { + // CHECK: test_vmull_laneq_u32 + return vmull_laneq_u32(a, v, 1); + // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vmull_high_laneq_s16 + return vmull_high_laneq_s16(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vmull_high_laneq_s32 + return vmull_high_laneq_s32(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { + // CHECK: test_vmull_high_laneq_u16 + return vmull_high_laneq_u16(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { + // CHECK: test_vmull_high_laneq_u32 + return vmull_high_laneq_u32(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vqdmlal_lane_s16 + return 
vqdmlal_lane_s16(a, b, v, 1); + // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vqdmlal_lane_s32 + return vqdmlal_lane_s32(a, b, v, 1); + // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vqdmlal_high_lane_s16 + return vqdmlal_high_lane_s16(a, b, v, 1); + // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vqdmlal_high_lane_s32 + return vqdmlal_high_lane_s32(a, b, v, 1); + // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vqdmlsl_lane_s16 + return vqdmlsl_lane_s16(a, b, v, 1); + // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vqdmlsl_lane_s32 + return vqdmlsl_lane_s32(a, b, v, 1); + // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vqdmlsl_high_lane_s16 + return vqdmlsl_high_lane_s16(a, b, v, 1); + // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vqdmlsl_high_lane_s32 + return vqdmlsl_high_lane_s32(a, b, v, 1); + // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqdmull_lane_s16 + return vqdmull_lane_s16(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: 
test_vqdmull_lane_s32 + return vqdmull_lane_s32(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vqdmull_laneq_s16 + return vqdmull_laneq_s16(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vqdmull_laneq_s32 + return vqdmull_laneq_s32(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqdmull_high_lane_s16 + return vqdmull_high_lane_s16(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqdmull_high_lane_s32 + return vqdmull_high_lane_s32(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vqdmull_high_laneq_s16 + return vqdmull_high_laneq_s16(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vqdmull_high_laneq_s32 + return vqdmull_high_laneq_s32(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqdmulh_lane_s16 + return vqdmulh_lane_s16(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqdmulhq_lane_s16 + return vqdmulhq_lane_s16(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqdmulh_lane_s32 + return vqdmulh_lane_s32(a, v, 1); + // CHECK: 
sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqdmulhq_lane_s32 + return vqdmulhq_lane_s32(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqrdmulh_lane_s16 + return vqrdmulh_lane_s16(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqrdmulhq_lane_s16 + return vqrdmulhq_lane_s16(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqrdmulh_lane_s32 + return vqrdmulh_lane_s32(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqrdmulhq_lane_s32 + return vqrdmulhq_lane_s32(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { + // CHECK: test_vmul_lane_f32 + return vmul_lane_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { + // CHECK: test_vmulq_lane_f32 + return vmulq_lane_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { + // CHECK: test_vmulq_lane_f64 + return vmulq_lane_f64(a, v, 0); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { + // CHECK: test_vmul_laneq_f32 + return vmul_laneq_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { + // CHECK: 
test_vmulq_laneq_f32 + return vmulq_laneq_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulq_laneq_f64 + return vmulq_laneq_f64(a, v, 0); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulq_laneq_f64 + return vmulq_laneq_f64(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { + // CHECK: test_vmulx_lane_f32 + return vmulx_lane_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { + // CHECK: test_vmulxq_lane_f32 + return vmulxq_lane_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { + // CHECK: test_vmulxq_lane_f64 + return vmulxq_lane_f64(a, v, 0); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { + // CHECK: test_vmulx_laneq_f32 + return vmulx_laneq_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { + // CHECK: test_vmulxq_laneq_f32 + return vmulxq_laneq_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulxq_laneq_f64 + return vmulxq_laneq_f64(a, v, 0); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulxq_laneq_f64 + return vmulxq_laneq_f64(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + Index: 
utils/TableGen/NeonEmitter.cpp =================================================================== --- utils/TableGen/NeonEmitter.cpp +++ utils/TableGen/NeonEmitter.cpp @@ -62,16 +62,45 @@ OpMlalN, OpMlslN, OpMulLane, + OpMulLaneQ, + OpMulXLane, + OpMulXLaneQ, OpMullLane, + OpMullHiLane, + OpMullLaneQ, + OpMullHiLaneQ, OpMlaLane, + OpMlaLaneQ, OpMlsLane, + OpMlsLaneQ, OpMlalLane, + OpMlalLaneQ, + OpMlalHiLane, + OpMlalHiLaneQ, OpMlslLane, + OpMlslLaneQ, + OpMlslHiLane, + OpMlslHiLaneQ, OpQDMullLane, + OpQDMullHiLane, + OpQDMullLaneQ, + OpQDMullHiLaneQ, OpQDMlalLane, + OpQDMlalLaneQ, + OpQDMlalHiLane, + OpQDMlalHiLaneQ, OpQDMlslLane, + OpQDMlslLaneQ, + OpQDMlslHiLane, + OpQDMlslHiLaneQ, OpQDMulhLane, + OpQDMulhLaneQ, OpQRDMulhLane, + OpQRDMulhLaneQ, + OpFMALane, + OpFMALaneQ, + OpFMSLane, + OpFMSLaneQ, OpEq, OpGe, OpLe, @@ -197,16 +226,45 @@ OpMap["OP_MLAL_N"] = OpMlalN; OpMap["OP_MLSL_N"] = OpMlslN; OpMap["OP_MUL_LN"]= OpMulLane; + OpMap["OP_MUL_LNQ"]= OpMulLaneQ; + OpMap["OP_MULX_LN"]= OpMulXLane; + OpMap["OP_MULX_LNQ"]= OpMulXLaneQ; OpMap["OP_MULL_LN"] = OpMullLane; + OpMap["OP_MULLHi_LN"] = OpMullHiLane; + OpMap["OP_MULL_LNQ"] = OpMullLaneQ; + OpMap["OP_MULLHi_LNQ"] = OpMullHiLaneQ; OpMap["OP_MLA_LN"]= OpMlaLane; + OpMap["OP_MLA_LNQ"]= OpMlaLaneQ; OpMap["OP_MLS_LN"]= OpMlsLane; + OpMap["OP_MLS_LNQ"]= OpMlsLaneQ; OpMap["OP_MLAL_LN"] = OpMlalLane; + OpMap["OP_MLAL_LNQ"] = OpMlalLaneQ; + OpMap["OP_MLALHi_LN"] = OpMlalHiLane; + OpMap["OP_MLALHi_LNQ"] = OpMlalHiLaneQ; OpMap["OP_MLSL_LN"] = OpMlslLane; + OpMap["OP_MLSL_LNQ"] = OpMlslLaneQ; + OpMap["OP_MLSLHi_LN"] = OpMlslHiLane; + OpMap["OP_MLSLHi_LNQ"] = OpMlslHiLaneQ; OpMap["OP_QDMULL_LN"] = OpQDMullLane; + OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane; + OpMap["OP_QDMULL_LNQ"] = OpQDMullLaneQ; + OpMap["OP_QDMULLHi_LNQ"] = OpQDMullHiLaneQ; OpMap["OP_QDMLAL_LN"] = OpQDMlalLane; + OpMap["OP_QDMLAL_LNQ"] = OpQDMlalLaneQ; + OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane; + OpMap["OP_QDMLALHi_LNQ"] = OpQDMlalHiLaneQ; 
OpMap["OP_QDMLSL_LN"] = OpQDMlslLane; + OpMap["OP_QDMLSL_LNQ"] = OpQDMlslLaneQ; + OpMap["OP_QDMLSLHi_LN"] = OpQDMlslHiLane; + OpMap["OP_QDMLSLHi_LNQ"] = OpQDMlslHiLaneQ; OpMap["OP_QDMULH_LN"] = OpQDMulhLane; + OpMap["OP_QDMULH_LNQ"] = OpQDMulhLaneQ; OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane; + OpMap["OP_QRDMULH_LNQ"] = OpQRDMulhLaneQ; + OpMap["OP_FMA_LN"] = OpFMALane; + OpMap["OP_FMA_LNQ"] = OpFMALaneQ; + OpMap["OP_FMS_LN"] = OpFMSLane; + OpMap["OP_FMS_LNQ"] = OpFMSLaneQ; OpMap["OP_EQ"] = OpEq; OpMap["OP_GE"] = OpGe; OpMap["OP_LE"] = OpLe; @@ -447,6 +505,9 @@ case 'g': quad = false; break; + case 'j': + quad = true; + break; case 'w': type = Widen(type); quad = true; @@ -626,7 +687,8 @@ type = 's'; usgn = true; } - usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && scal && type != 'f'); + usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && + scal && type != 'f' && type != 'd'); if (scal) { SmallString<128> s; @@ -657,6 +719,8 @@ return "vv*"; // void result with void* first argument if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? "V4f" : "V2f"; + if (ck != ClassB && type == 'd') + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? "V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -677,6 +741,8 @@ if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? "V4f" : "V2f"; + if (ck != ClassB && type == 'd') + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? 
"V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -974,6 +1040,7 @@ NormedProto += 'q'; break; case 'g': + case 'j': case 'h': case 'e': NormedProto += 'd'; @@ -1502,19 +1569,34 @@ s += "__a * " + Duplicate(nElts, typestr, "__b") + ";"; break; case OpMulLane: + case OpMulLaneQ: s += "__a * " + SplatLane(nElts, "__b", "__c") + ";"; break; + case OpMulXLane: + case OpMulXLaneQ: + s += MangleName("vmulx", typestr, ClassS) + "(__a, " + + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMul: s += "__a * __b;"; break; case OpMullLane: + case OpMullLaneQ: s += MangleName("vmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; + case OpMullHiLane: + case OpMullHiLaneQ: + s += MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMlaN: s += "__a + (__b * " + Duplicate(nElts, typestr, "__c") + ");"; break; case OpMlaLane: + case OpMlaLaneQ: + case OpFMALane: + case OpFMALaneQ: s += "__a + (__b * " + SplatLane(nElts, "__c", "__d") + ");"; break; case OpMla: @@ -1525,9 +1607,15 @@ Duplicate(nElts, typestr, "__c") + ");"; break; case OpMlalLane: + case OpMlalLaneQ: s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlalHiLane: + case OpMlalHiLaneQ: + s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlal: s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; @@ -1541,6 +1629,9 @@ s += "__a - (__b * " + Duplicate(nElts, typestr, "__c") + ");"; break; case OpMlsLane: + case OpMlsLaneQ: + case OpFMSLane: + case OpFMSLaneQ: s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");"; break; case OpMls: @@ -1551,9 +1642,15 @@ Duplicate(nElts, typestr, "__c") + ");"; break; case OpMlslLane: + case OpMlslLaneQ: s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " + 
SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlslHiLane: + case OpMlslHiLaneQ: + s += "__a - " + MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlsl: s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; @@ -1561,22 +1658,42 @@ s += Gen3OpWith2High(typestr, "vmlsl", "__a", "__b", "__c"); break; case OpQDMullLane: + case OpQDMullLaneQ: s += MangleName("vqdmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; + case OpQDMullHiLane: + case OpQDMullHiLaneQ: + s += MangleName("vqdmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpQDMlalLane: + case OpQDMlalLaneQ: s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlalHiLane: + case OpQDMlalHiLaneQ: + s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMlslLane: + case OpQDMlslLaneQ: s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlslHiLane: + case OpQDMlslHiLaneQ: + s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMulhLane: + case OpQDMulhLaneQ: s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; case OpQRDMulhLane: + case OpQRDMulhLaneQ: s += MangleName("vqrdmulh", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; @@ -2072,20 +2189,28 @@ // Emit Neon vector typedefs. std::string TypedefTypes( - "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfQdPcQPcPsQPs"); + "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPs"); SmallVector TDTypeVec; ParseTypes(0, TypedefTypes, TDTypeVec); // Emit vector typedefs. 
+ bool isA64 = false; for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { bool dummy, quad = false, poly = false; char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); - bool isA64 = false; + bool preinsert = false; + bool postinsert = false; - if (type == 'd' && quad) + if (type == 'd') { + preinsert = !isA64; isA64 = true; - - if (isA64) + } else { + postinsert = isA64; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) OS << "#ifdef __aarch64__\n"; if (poly) @@ -2101,22 +2226,28 @@ OS << TypeString('s', TDTypeVec[i]); OS << " " << TypeString('d', TDTypeVec[i]) << ";\n"; - if (isA64) - OS << "#endif\n"; } OS << "\n"; // Emit struct typedefs. + isA64 = false; for (unsigned vi = 2; vi != 5; ++vi) { for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { bool dummy, quad = false, poly = false; char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); - bool isA64 = false; + bool preinsert = false; + bool postinsert = false; - if (type == 'd' && quad) + if (type == 'd') { + preinsert = !isA64; isA64 = true; - - if (isA64) + } else { + postinsert = isA64; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) OS << "#ifdef __aarch64__\n"; std::string ts = TypeString('d', TDTypeVec[i]); @@ -2126,10 +2257,6 @@ OS << "[" << utostr(vi) << "]"; OS << ";\n} "; OS << vs << ";\n"; - - if (isA64) - OS << "#endif\n"; - OS << "\n"; } } @@ -2255,6 +2382,7 @@ case 'f': case 'i': return (2 << (int)quad) - 1; + case 'd': case 'l': return (1 << (int)quad) - 1; default: