Index: include/clang/Basic/arm_neon.td =================================================================== --- include/clang/Basic/arm_neon.td +++ include/clang/Basic/arm_neon.td @@ -40,16 +40,25 @@ def OP_MLAL_N : Op; def OP_MLSL_N : Op; def OP_MUL_LN: Op; +def OP_MULX_LN: Op; def OP_MULL_LN : Op; +def OP_MULLHi_LN : Op; def OP_MLA_LN: Op; def OP_MLS_LN: Op; def OP_MLAL_LN : Op; +def OP_MLALHi_LN : Op; def OP_MLSL_LN : Op; +def OP_MLSLHi_LN : Op; def OP_QDMULL_LN : Op; +def OP_QDMULLHi_LN : Op; def OP_QDMLAL_LN : Op; +def OP_QDMLALHi_LN : Op; def OP_QDMLSL_LN : Op; +def OP_QDMLSLHi_LN : Op; def OP_QDMULH_LN : Op; def OP_QRDMULH_LN : Op; +def OP_FMA_LN : Op; +def OP_FMS_LN : Op; def OP_EQ : Op; def OP_GE : Op; def OP_LE : Op; @@ -146,6 +155,7 @@ // f: float (int args) // d: default // g: default, ignore 'Q' size modifier. +// j: default, force 'Q' size modifier. // w: double width elements, same num elts // n: double width elements, half num elts // h: half width elements, double num elts @@ -503,7 +513,7 @@ //////////////////////////////////////////////////////////////////////////////// // Multiplication Extended -def MULX : SInst<"vmulx", "ddd", "fQfQd">; +def MULX : SInst<"vmulx", "ddd", "fdQfQd">; //////////////////////////////////////////////////////////////////////////////// // Division @@ -630,6 +640,63 @@ def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>; //////////////////////////////////////////////////////////////////////////////// + +def VMLA_LANEQ : IOpInst<"vmla_laneq", "dddji", + "siUsUifQsQiQUsQUiQf", OP_MLA_LN>; +def VMLS_LANEQ : IOpInst<"vmls_laneq", "dddji", + "siUsUifQsQiQUsQUiQf", OP_MLS_LN>; + +def VFMA_LANE : IOpInst<"vfma_lane", "dddgi", "fdQfQd", OP_FMA_LN>; +def VFMA_LANEQ : IOpInst<"vfma_laneq", "dddji", "fdQfQd", OP_FMA_LN>; +def VFMS_LANE : IOpInst<"vfms_lane", "dddgi", "fdQfQd", OP_FMS_LN>; +def VFMS_LANEQ : IOpInst<"vfms_laneq", "dddji", "fdQfQd", OP_FMS_LN>; + +def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "wwdki", 
"siUsUi", OP_MLAL_LN>; +def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "wwkdi", "siUsUi", + OP_MLALHi_LN>; +def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "wwkki", "siUsUi", + OP_MLALHi_LN>; +def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "wwdki", "siUsUi", OP_MLSL_LN>; +def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "wwkdi", "siUsUi", + OP_MLSLHi_LN>; +def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "wwkki", "siUsUi", + OP_MLSLHi_LN>; + +def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "wwdki", "si", OP_QDMLAL_LN>; +def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "wwkdi", "si", + OP_QDMLALHi_LN>; +def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "wwkki", "si", + OP_QDMLALHi_LN>; +def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "wwdki", "si", OP_QDMLSL_LN>; +def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "wwkdi", "si", + OP_QDMLSLHi_LN>; +def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "wwkki", "si", + OP_QDMLSLHi_LN>; + +// Newly add double parameter for vmul_lane in aarch64 +def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "dQd", OP_MUL_LN>; + +def VMUL_LANEQ : IOpInst<"vmul_laneq", "ddji", + "sifdUsUiQsQiQfQUsQUiQfQd", OP_MUL_LN>; +def VMULL_LANEQ : SOpInst<"vmull_laneq", "wdki", "siUsUi", OP_MULL_LN>; +def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "wkdi", "siUsUi", + OP_MULLHi_LN>; +def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "wkki", "siUsUi", + OP_MULLHi_LN>; + +def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "wdki", "si", OP_QDMULL_LN>; +def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "wkdi", "si", + OP_QDMULLHi_LN>; +def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "wkki", "si", + OP_QDMULLHi_LN>; + +def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ddji", "siQsQi", OP_QDMULH_LN>; +def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ddji", "siQsQi", OP_QRDMULH_LN>; + +def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fdQfQd", OP_MULX_LN>; +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fdQfQd", OP_MULX_LN>; + 
+//////////////////////////////////////////////////////////////////////////////// // Scalar Arithmetic // Scalar Addition Index: test/CodeGen/aarch64-neon-2velem.c =================================================================== --- /dev/null +++ test/CodeGen/aarch64-neon-2velem.c @@ -0,0 +1,800 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \ +// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s + +// Test new aarch64 intrinsics and types + +#include <arm_neon.h> + +int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmla_lane_s16 + return vmla_lane_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlaq_lane_s16 + return vmlaq_lane_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmla_lane_s32 + return vmla_lane_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlaq_lane_s32 + return vmlaq_lane_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmla_laneq_s16 + return vmla_laneq_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlaq_laneq_s16 + return vmlaq_laneq_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmla_laneq_s32 + return vmla_laneq_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, 
{{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlaq_laneq_s32 + return vmlaq_laneq_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmls_lane_s16 + return vmls_lane_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsq_lane_s16 + return vmlsq_lane_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmls_lane_s32 + return vmls_lane_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsq_lane_s32 + return vmlsq_lane_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmls_laneq_s16 + return vmls_laneq_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsq_laneq_s16 + return vmlsq_laneq_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmls_laneq_s32 + return vmls_laneq_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsq_laneq_s32 + return vmlsq_laneq_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: 
test_vmul_lane_s16 + return vmul_lane_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vmulq_lane_s16 + return vmulq_lane_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vmul_lane_s32 + return vmul_lane_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vmulq_lane_s32 + return vmulq_lane_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { + // CHECK: test_vmul_lane_u16 + return vmul_lane_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { + // CHECK: test_vmulq_lane_u16 + return vmulq_lane_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { + // CHECK: test_vmul_lane_u32 + return vmul_lane_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { + // CHECK: test_vmulq_lane_u32 + return vmulq_lane_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vmul_laneq_s16 + return vmul_laneq_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vmulq_laneq_s16 + return vmulq_laneq_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vmul_laneq_s32 + return 
vmul_laneq_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vmulq_laneq_s32 + return vmulq_laneq_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { + // CHECK: test_vmul_laneq_u16 + return vmul_laneq_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { + // CHECK: test_vmulq_laneq_u16 + return vmulq_laneq_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { + // CHECK: test_vmul_laneq_u32 + return vmul_laneq_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) { + // CHECK: test_vmulq_laneq_u32 + return vmulq_laneq_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { + // CHECK: test_vfma_lane_f32 + return vfma_lane_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { + // CHECK: test_vfmaq_lane_f32 + return vfmaq_lane_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { + // CHECK: test_vfma_laneq_f32 + return vfma_laneq_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { + // CHECK: test_vfmaq_laneq_f32 + return vfmaq_laneq_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t 
test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { + // CHECK: test_vfms_lane_f32 + return vfms_lane_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { + // CHECK: test_vfmsq_lane_f32 + return vfmsq_lane_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { + // CHECK: test_vfms_laneq_f32 + return vfms_laneq_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { + // CHECK: test_vfmsq_laneq_f32 + return vfmsq_laneq_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { + // CHECK: test_vfmaq_lane_f64 + return vfmaq_lane_f64(a, b, v, 0); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmaq_laneq_f64 + return vfmaq_laneq_f64(a, b, v, 0); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmaq_laneq_f64 + return vfmaq_laneq_f64(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { + // CHECK: test_vfmsq_lane_f64 + return vfmsq_lane_f64(a, b, v, 0); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmsq_laneq_f64 + return vfmsq_laneq_f64(a, b, v, 0); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t 
test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmsq_laneq_f64 + return vfmsq_laneq_f64(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlal_lane_s16 + return vmlal_lane_s16(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlal_lane_s32 + return vmlal_lane_s32(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlal_laneq_s16 + return vmlal_laneq_s16(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlal_laneq_s32 + return vmlal_laneq_s32(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlal_high_lane_s16 + return vmlal_high_lane_s16(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlal_high_lane_s32 + return vmlal_high_lane_s32(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlal_high_laneq_s16 + return vmlal_high_laneq_s16(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlal_high_laneq_s32 + return vmlal_high_laneq_s32(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + 
+int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlsl_lane_s16 + return vmlsl_lane_s16(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlsl_lane_s32 + return vmlsl_lane_s32(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlsl_laneq_s16 + return vmlsl_laneq_s16(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlsl_laneq_s32 + return vmlsl_laneq_s32(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsl_high_lane_s16 + return vmlsl_high_lane_s16(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsl_high_lane_s32 + return vmlsl_high_lane_s32(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsl_high_laneq_s16 + return vmlsl_high_laneq_s16(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsl_high_laneq_s32 + return vmlsl_high_laneq_s32(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlal_lane_u16 + return vmlal_lane_u16(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + 
+int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlal_lane_u32 + return vmlal_lane_u32(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlal_laneq_u16 + return vmlal_laneq_u16(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlal_laneq_u32 + return vmlal_laneq_u32(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlal_high_lane_u16 + return vmlal_high_lane_u16(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlal_high_lane_u32 + return vmlal_high_lane_u32(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlal_high_laneq_u16 + return vmlal_high_laneq_u16(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlal_high_laneq_u32 + return vmlal_high_laneq_u32(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlsl_lane_u16 + return vmlsl_lane_u16(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlsl_lane_u32 + return vmlsl_lane_u32(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + 
+int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlsl_laneq_u16 + return vmlsl_laneq_u16(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlsl_laneq_u32 + return vmlsl_laneq_u32(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsl_high_lane_u16 + return vmlsl_high_lane_u16(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsl_high_lane_u32 + return vmlsl_high_lane_u32(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsl_high_laneq_u16 + return vmlsl_high_laneq_u16(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsl_high_laneq_u32 + return vmlsl_high_laneq_u32(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vmull_lane_s16 + return vmull_lane_s16(a, v, 1); + // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vmull_lane_s32 + return vmull_lane_s32(a, v, 1); + // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { + // CHECK: test_vmull_lane_u16 + return vmull_lane_u16(a, v, 1); + // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_lane_u32(uint32x2_t a, 
uint32x2_t v) { + // CHECK: test_vmull_lane_u32 + return vmull_lane_u32(a, v, 1); + // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vmull_high_lane_s16 + return vmull_high_lane_s16(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vmull_high_lane_s32 + return vmull_high_lane_s32(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { + // CHECK: test_vmull_high_lane_u16 + return vmull_high_lane_u16(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { + // CHECK: test_vmull_high_lane_u32 + return vmull_high_lane_u32(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vmull_laneq_s16 + return vmull_laneq_s16(a, v, 1); + // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vmull_laneq_s32 + return vmull_laneq_s32(a, v, 1); + // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { + // CHECK: test_vmull_laneq_u16 + return vmull_laneq_u16(a, v, 1); + // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { + // CHECK: test_vmull_laneq_u32 + return vmull_laneq_u32(a, v, 1); + // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vmull_high_laneq_s16 + return vmull_high_laneq_s16(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vmull_high_laneq_s32 + return vmull_high_laneq_s32(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { + // CHECK: test_vmull_high_laneq_u16 + return vmull_high_laneq_u16(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { + // CHECK: test_vmull_high_laneq_u32 + return vmull_high_laneq_u32(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vqdmlal_lane_s16 + return vqdmlal_lane_s16(a, b, v, 1); + // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vqdmlal_lane_s32 + return vqdmlal_lane_s32(a, b, v, 1); + // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vqdmlal_high_lane_s16 + return vqdmlal_high_lane_s16(a, b, v, 1); + // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vqdmlal_high_lane_s32 + return vqdmlal_high_lane_s32(a, b, v, 1); + // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vqdmlsl_lane_s16 + return vqdmlsl_lane_s16(a, b, v, 1); + // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vqdmlsl_lane_s32 + return vqdmlsl_lane_s32(a, b, v, 1); + // CHECK: sqdmlsl 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vqdmlsl_high_lane_s16 + return vqdmlsl_high_lane_s16(a, b, v, 1); + // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vqdmlsl_high_lane_s32 + return vqdmlsl_high_lane_s32(a, b, v, 1); + // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqdmull_lane_s16 + return vqdmull_lane_s16(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqdmull_lane_s32 + return vqdmull_lane_s32(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vqdmull_laneq_s16 + return vqdmull_laneq_s16(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vqdmull_laneq_s32 + return vqdmull_laneq_s32(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqdmull_high_lane_s16 + return vqdmull_high_lane_s16(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqdmull_high_lane_s32 + return vqdmull_high_lane_s32(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vqdmull_high_laneq_s16 + return vqdmull_high_laneq_s16(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, 
{{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vqdmull_high_laneq_s32 + return vqdmull_high_laneq_s32(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqdmulh_lane_s16 + return vqdmulh_lane_s16(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqdmulhq_lane_s16 + return vqdmulhq_lane_s16(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqdmulh_lane_s32 + return vqdmulh_lane_s32(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqdmulhq_lane_s32 + return vqdmulhq_lane_s32(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqrdmulh_lane_s16 + return vqrdmulh_lane_s16(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqrdmulhq_lane_s16 + return vqrdmulhq_lane_s16(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqrdmulh_lane_s32 + return vqrdmulh_lane_s32(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqrdmulhq_lane_s32 + return vqrdmulhq_lane_s32(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { + // CHECK: 
test_vmul_lane_f32 + return vmul_lane_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { + // CHECK: test_vmulq_lane_f32 + return vmulq_lane_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { + // CHECK: test_vmulq_lane_f64 + return vmulq_lane_f64(a, v, 0); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { + // CHECK: test_vmul_laneq_f32 + return vmul_laneq_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { + // CHECK: test_vmulq_laneq_f32 + return vmulq_laneq_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulq_laneq_f64 + return vmulq_laneq_f64(a, v, 0); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulq_laneq_f64 + return vmulq_laneq_f64(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { + // CHECK: test_vmulx_lane_f32 + return vmulx_lane_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { + // CHECK: test_vmulxq_lane_f32 + return vmulxq_lane_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { + // CHECK: test_vmulxq_lane_f64 + return vmulxq_lane_f64(a, v, 0); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float32x2_t 
test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { + // CHECK: test_vmulx_laneq_f32 + return vmulx_laneq_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { + // CHECK: test_vmulxq_laneq_f32 + return vmulxq_laneq_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulxq_laneq_f64 + return vmulxq_laneq_f64(a, v, 0); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulxq_laneq_f64 + return vmulxq_laneq_f64(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + Index: utils/TableGen/NeonEmitter.cpp =================================================================== --- utils/TableGen/NeonEmitter.cpp +++ utils/TableGen/NeonEmitter.cpp @@ -62,16 +62,25 @@ OpMlalN, OpMlslN, OpMulLane, + OpMulXLane, OpMullLane, + OpMullHiLane, OpMlaLane, OpMlsLane, OpMlalLane, + OpMlalHiLane, OpMlslLane, + OpMlslHiLane, OpQDMullLane, + OpQDMullHiLane, OpQDMlalLane, + OpQDMlalHiLane, OpQDMlslLane, + OpQDMlslHiLane, OpQDMulhLane, OpQRDMulhLane, + OpFMALane, + OpFMSLane, OpEq, OpGe, OpLe, @@ -197,16 +206,25 @@ OpMap["OP_MLAL_N"] = OpMlalN; OpMap["OP_MLSL_N"] = OpMlslN; OpMap["OP_MUL_LN"]= OpMulLane; + OpMap["OP_MULX_LN"]= OpMulXLane; OpMap["OP_MULL_LN"] = OpMullLane; + OpMap["OP_MULLHi_LN"] = OpMullHiLane; OpMap["OP_MLA_LN"]= OpMlaLane; OpMap["OP_MLS_LN"]= OpMlsLane; OpMap["OP_MLAL_LN"] = OpMlalLane; + OpMap["OP_MLALHi_LN"] = OpMlalHiLane; OpMap["OP_MLSL_LN"] = OpMlslLane; + OpMap["OP_MLSLHi_LN"] = OpMlslHiLane; OpMap["OP_QDMULL_LN"] = OpQDMullLane; + OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane; OpMap["OP_QDMLAL_LN"] = OpQDMlalLane; + OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane; OpMap["OP_QDMLSL_LN"] = OpQDMlslLane; + OpMap["OP_QDMLSLHi_LN"] = 
OpQDMlslHiLane; OpMap["OP_QDMULH_LN"] = OpQDMulhLane; OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane; + OpMap["OP_FMA_LN"] = OpFMALane; + OpMap["OP_FMS_LN"] = OpFMSLane; OpMap["OP_EQ"] = OpEq; OpMap["OP_GE"] = OpGe; OpMap["OP_LE"] = OpLe; @@ -447,6 +465,9 @@ case 'g': quad = false; break; + case 'j': + quad = true; + break; case 'w': type = Widen(type); quad = true; @@ -626,7 +647,8 @@ type = 's'; usgn = true; } - usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && scal && type != 'f'); + usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && + scal && type != 'f' && type != 'd'); if (scal) { SmallString<128> s; @@ -657,6 +679,8 @@ return "vv*"; // void result with void* first argument if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? "V4f" : "V2f"; + if (ck != ClassB && type == 'd') + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? "V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -677,6 +701,8 @@ if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? "V4f" : "V2f"; + if (ck != ClassB && type == 'd') + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? 
"V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -974,6 +1000,7 @@ NormedProto += 'q'; break; case 'g': + case 'j': case 'h': case 'e': NormedProto += 'd'; @@ -1504,6 +1531,10 @@ case OpMulLane: s += "__a * " + SplatLane(nElts, "__b", "__c") + ";"; break; + case OpMulXLane: + s += MangleName("vmulx", typestr, ClassS) + "(__a, " + + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMul: s += "__a * __b;"; break; @@ -1511,10 +1542,15 @@ s += MangleName("vmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; + case OpMullHiLane: + s += MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMlaN: s += "__a + (__b * " + Duplicate(nElts, typestr, "__c") + ");"; break; case OpMlaLane: + case OpFMALane: s += "__a + (__b * " + SplatLane(nElts, "__c", "__d") + ");"; break; case OpMla: @@ -1528,6 +1564,10 @@ s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlalHiLane: + s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlal: s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; @@ -1541,6 +1581,7 @@ s += "__a - (__b * " + Duplicate(nElts, typestr, "__c") + ");"; break; case OpMlsLane: + case OpFMSLane: s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");"; break; case OpMls: @@ -1554,6 +1595,10 @@ s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlslHiLane: + s += "__a - " + MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlsl: s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; @@ -1564,14 +1609,26 @@ s += MangleName("vqdmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", 
"__c") + ");"; break; + case OpQDMullHiLane: + s += MangleName("vqdmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpQDMlalLane: s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlalHiLane: + s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMlslLane: s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlslHiLane: + s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMulhLane: s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; @@ -2072,20 +2129,28 @@ // Emit Neon vector typedefs. std::string TypedefTypes( - "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfQdPcQPcPsQPs"); + "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPs"); SmallVector TDTypeVec; ParseTypes(0, TypedefTypes, TDTypeVec); // Emit vector typedefs. + bool isA64 = false; for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { bool dummy, quad = false, poly = false; char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); - bool isA64 = false; + bool preinsert = false; + bool postinsert = false; - if (type == 'd' && quad) + if (type == 'd') { + preinsert = isA64? false: true; isA64 = true; - - if (isA64) + } else { + postinsert = isA64? true: false; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) OS << "#ifdef __aarch64__\n"; if (poly) @@ -2101,22 +2166,28 @@ OS << TypeString('s', TDTypeVec[i]); OS << " " << TypeString('d', TDTypeVec[i]) << ";\n"; - if (isA64) - OS << "#endif\n"; } OS << "\n"; // Emit struct typedefs. 
+ isA64 = false; for (unsigned vi = 2; vi != 5; ++vi) { for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { bool dummy, quad = false, poly = false; char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); - bool isA64 = false; + bool preinsert = false; + bool postinsert = false; - if (type == 'd' && quad) + if (type == 'd') { + preinsert = isA64? false: true; isA64 = true; - - if (isA64) + } else { + postinsert = isA64? true: false; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) OS << "#ifdef __aarch64__\n"; std::string ts = TypeString('d', TDTypeVec[i]); @@ -2126,10 +2197,6 @@ OS << "[" << utostr(vi) << "]"; OS << ";\n} "; OS << vs << ";\n"; - - if (isA64) - OS << "#endif\n"; - OS << "\n"; } } @@ -2255,6 +2322,7 @@ case 'f': case 'i': return (2 << (int)quad) - 1; + case 'd': case 'l': return (1 << (int)quad) - 1; default: