diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -528,9 +528,16 @@
 def VQDMULL_N : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>;
 def VQDMULL_LANE : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>;
 def VQDMULH_N : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>;
-def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_N : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>;
+
+let ArchGuard = "!defined(__aarch64__)" in {
+def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>;
 def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>;
+}
+let ArchGuard = "defined(__aarch64__)" in {
+def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..qI", "siQsQi">;
+def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">;
+}
 
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in {
 def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>;
@@ -951,9 +958,10 @@
 def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si", OP_QDMULLHi_LN>;
-def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>;
-def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>;
-
+let isLaneQ = 1 in {
+def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">;
+def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">;
+}
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
 def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>;
 def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4969,14 +4969,22 @@
   NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
   NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
   NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
+  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
+  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
   NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
+  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
+  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
   NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
   NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
   NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
   NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
   NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
   NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
+  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
   NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
+  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
+  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
   NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
   NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
   NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
@@ -5754,6 +5762,24 @@
     Ops.resize(2);
     return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
   }
+  case NEON::BI__builtin_neon_vqdmulhq_lane_v:
+  case NEON::BI__builtin_neon_vqdmulh_lane_v:
+  case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
+  case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
+    llvm::Type *Tys[2] = {
+        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                            /*isQuad*/ false))};
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
+  }
+  case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
+  case NEON::BI__builtin_neon_vqdmulh_laneq_v:
+  case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
+  case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
+    llvm::Type *Tys[2] = {
+        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                            /*isQuad*/ true))};
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
+  }
   case NEON::BI__builtin_neon_vqshl_n_v:
   case NEON::BI__builtin_neon_vqshlq_n_v:
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -1440,12 +1440,12 @@
 // CHECK-LABEL: @test_vqdmulh_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
 //
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqdmulh_lane_s16(a, v, 3);
@@ -1453,12 +1453,12 @@
 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqdmulhq_lane_s16(a, v, 3);
@@ -1466,12 +1466,12 @@
 // CHECK-LABEL: @test_vqdmulh_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
 //
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqdmulh_lane_s32(a, v, 1);
@@ -1479,12 +1479,12 @@
 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqdmulhq_lane_s32(a, v, 1);
@@ -1492,12 +1492,12 @@
 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
 //
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
   return vqrdmulh_lane_s16(a, v, 3);
@@ -1505,12 +1505,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
   return vqrdmulhq_lane_s16(a, v, 3);
@@ -1518,12 +1518,12 @@
 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> <i32 1, i32 1>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
 //
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
   return vqrdmulh_lane_s32(a, v, 1);
@@ -1531,12 +1531,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
   return vqrdmulhq_lane_s32(a, v, 1);
@@ -3066,12 +3066,12 @@
 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]]
 //
 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vqdmulh_lane_s16(a, v, 0);
@@ -3079,12 +3079,12 @@
 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vqdmulhq_lane_s16(a, v, 0);
@@ -3092,12 +3092,12 @@
 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]]
 //
 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vqdmulh_lane_s32(a, v, 0);
@@ -3105,12 +3105,12 @@
 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vqdmulhq_lane_s32(a, v, 0);
@@ -3118,12 +3118,12 @@
 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]]
 //
 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vqrdmulh_lane_s16(a, v, 0);
@@ -3131,12 +3131,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]]
 //
 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vqrdmulhq_lane_s16(a, v, 0);
@@ -3144,12 +3144,12 @@
 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]]
 //
 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vqrdmulh_lane_s32(a, v, 0);
@@ -3157,12 +3157,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vqrdmulhq_lane_s32(a, v, 0);
@@ -4753,12 +4753,12 @@
 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vqdmulh_laneq_s16(a, v, 0);
@@ -4766,12 +4766,12 @@
 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vqdmulhq_laneq_s16(a, v, 0);
@@ -4779,12 +4779,12 @@
 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vqdmulh_laneq_s32(a, v, 0);
@@ -4792,12 +4792,12 @@
 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vqdmulhq_laneq_s32(a, v, 0);
@@ -4805,12 +4805,12 @@
 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vqrdmulh_laneq_s16(a, v, 0);
@@ -4818,12 +4818,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vqrdmulhq_laneq_s16(a, v, 0);
@@ -4831,12 +4831,12 @@
 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vqrdmulh_laneq_s32(a, v, 0);
@@ -4844,12 +4844,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vqrdmulhq_laneq_s32(a, v, 0);
@@ -5149,12 +5149,12 @@
 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqdmulh_laneq_s16(a, v, 7);
@@ -5162,12 +5162,12 @@
 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqdmulhq_laneq_s16(a, v, 7);
@@ -5175,12 +5175,12 @@
 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqdmulh_laneq_s32(a, v, 3);
@@ -5188,12 +5188,12 @@
 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqdmulhq_laneq_s32(a, v, 3);
@@ -5201,12 +5201,12 @@
 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
 //
 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqrdmulh_laneq_s16(a, v, 7);
@@ -5214,12 +5214,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
 //
 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqrdmulhq_laneq_s16(a, v, 7);
@@ -5227,12 +5227,12 @@
 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
 //
 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqrdmulh_laneq_s32(a, v, 3);
@@ -5240,12 +5240,12 @@
 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[SHUFFLE]]) #4
-// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
 //
 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqrdmulhq_laneq_s32(a, v, 3);
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -133,6 +133,10 @@
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
                 [IntrNoMem]>;
+  class AdvSIMD_2VectorArg_Lane_Intrinsic
+    : Intrinsic<[llvm_anyint_ty],
+                [LLVMMatchType<0>, llvm_anyint_ty, llvm_i32_ty],
+                [IntrNoMem]>;
 
   class AdvSIMD_3VectorArg_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
@@ -207,9 +211,13 @@
   // Vector Saturating Doubling Multiply High
   def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic;
+  def int_aarch64_neon_sqdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
+  def int_aarch64_neon_sqdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
 
   // Vector Saturating Rounding Doubling Multiply High
   def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic;
+  def int_aarch64_neon_sqrdmulh_lane : AdvSIMD_2VectorArg_Lane_Intrinsic;
+  def int_aarch64_neon_sqrdmulh_laneq : AdvSIMD_2VectorArg_Lane_Intrinsic;
 
   // Vector Polynominal Multiply
   def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -360,6 +360,9 @@
 def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
 def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
 
+def UImmS1XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
 def UImmS2XForm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
 }]>;
@@ -7968,6 +7971,64 @@
   }
 }
 
+multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
+                                 SDPatternOperator OpNodeLaneQ> {
+
+  def : Pat<(v4i16 (OpNodeLane
+                     (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i16 (OpNodeLaneQ
+                     (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
+                     VectorIndexH32b:$idx)),
+            (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v8i16 (OpNodeLane
+                     (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), $Rm, dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v8i16 (OpNodeLaneQ
+                     (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
+                     VectorIndexH32b:$idx)),
+            (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v2i32 (OpNodeLane
+                     (v2i32 V64:$Rn), (v2i32 V64:$Rm),
+                     VectorIndexD32b:$idx)),
+            (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v2i32 (OpNodeLaneQ
+                     (v2i32 V64:$Rn), (v4i32 V128:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i32 (OpNodeLane
+                     (v4i32 V128:$Rn), (v2i32 V64:$Rm),
+                     VectorIndexD32b:$idx)),
+            (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
+              (SUBREG_TO_REG (i32 0), $Rm, dsub),
+              (UImmS1XForm $idx))>;
+
+  def : Pat<(v4i32 (OpNodeLaneQ
+                     (v4i32 V128:$Rn),
+                     (v4i32 V128:$Rm),
+                     VectorIndexS32b:$idx)),
+            (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
+              (UImmS1XForm $idx))>;
+
+}
+
 multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
                          SDPatternOperator OpNode> {
   def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5631,6 +5631,11 @@
 defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+                                     int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+                                      int_aarch64_neon_sqrdmulh_laneq>;
+
 // Generated by MachineCombine
 defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
 defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -230,6 +230,7 @@
   case AArch64::FPR16RegClassID:
   case AArch64::FPR32RegClassID:
   case AArch64::FPR64RegClassID:
+  case AArch64::FPR64_loRegClassID:
   case AArch64::FPR128RegClassID:
   case AArch64::FPR128_loRegClassID:
   case AArch64::DDRegClassID:
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -596,6 +596,7 @@
     return 32;
 
   case AArch64::FPR128_loRegClassID:
+  case AArch64::FPR64_loRegClassID:
     return 16;
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -429,6 +429,10 @@
 def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
                                       v1i64, v4f16],
                                       64, (sequence "D%u", 0, 31)>;
+def FPR64_lo : RegisterClass<"AArch64",
+                             [v8i8, v4i16, v2i32, v1i64, v4f16, v2f32, v1f64],
+                             64, (trunc FPR64, 16)>;
+
 // We don't (yet) have an f128 legal type, so don't use that here. We
 // normalize 128-bit vectors to v2f64 for arg passing and such, so use
 // that here.
@@ -503,6 +507,9 @@
   let Name = "VectorRegLo";
   let PredicateMethod = "isNeonVectorRegLo";
 }
+def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
+  let ParserMatchClass = VectorRegLoAsmOperand;
+}
 def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
   let ParserMatchClass = VectorRegLoAsmOperand;
 }
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1033,8 +1033,10 @@
   bool isNeonVectorRegLo() const {
     return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
-           AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
-               Reg.RegNum);
+           (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+                Reg.RegNum) ||
+            AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+                Reg.RegNum));
   }
 
   template <RegKind VectorKind> bool isSVEVectorReg() const {
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -9,20 +9,36 @@
 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
 
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
 
 declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
 
 declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
 
 declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
 
 declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
 
 declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32>, <2 x i32>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32>, <4 x i32>, i32)
 
 declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16>, <4 x i16>, i32)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16>, <8 x i16>, i32)
 
 declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16>, <4 x i16>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16>, <8 x i16>, i32)
 
 declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
@@ -1515,6 +1531,37 @@
   ret <4 x i16> %vqdmulh2.i
 }
 
+define <4 x i16> @test_vqdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+  ret <4 x i16> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+  ret <4 x i16> %vqdmulh2.i
+}
+
 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
 ; CHECK: // %bb.0: // %entry
@@ -1527,6 +1574,37 @@
   ret <8 x i16> %vqdmulh2.i
 }
 
+define <8 x i16> @test_vqdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+  ret <8 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+  ret <8 x i16> %vqdmulh2.i
+}
+
 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s32:
 ; CHECK: // %bb.0: // %entry
@@ -1539,6 +1617,37 @@
   ret <2 x i32> %vqdmulh2.i
 }
 
+define <2 x i32> @test_vqdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+  ret <2 x i32> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulh_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+  ret <2 x i32> %vqdmulh2.i
+}
+
 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
 ; CHECK: // %bb.0: // %entry
@@ -1551,6 +1660,37 @@
   ret <4 x i32> %vqdmulh2.i
 }
 
+define <4 x i32> @test_vqdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+  ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqdmulhq_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+  ret <4 x i32> %vqdmulh2.i
+}
+
 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
 ; CHECK: // %bb.0: // %entry
@@ -1563,6 +1703,37 @@
   ret <4 x i16> %vqrdmulh2.i
 }
 
+define <4 x i16> @test_vqrdmulh_lane_s16_intrinsic(<4 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> %a, <4 x i16> %v, i32 3)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_lo(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 3)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_laneq_s16_intrinsic_hi(<4 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> %a, <8 x i16> %v, i32 7)
+  ret <4 x i16> %vqrdmulh2.i
+}
+
 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
 ; CHECK: // %bb.0: // %entry
@@ -1575,6 +1746,37 @@
   ret <8 x i16> %vqrdmulh2.i
 }
 
+define <8 x i16> @test_vqrdmulhq_lane_s16_intrinsic(<8 x i16> %a, <4 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s16_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> %a, <4 x i16> %v, i32 3)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_lo(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 3)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_laneq_s16_intrinsic_hi(<8 x i16> %a, <8 x i16> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s16_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.8h, v0.8h, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> %a, <8 x i16> %v, i32 7)
+  ret <8 x i16> %vqrdmulh2.i
+}
+
 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
 ; CHECK: // %bb.0: // %entry
@@ -1587,6 +1789,37 @@
   ret <2 x i32> %vqrdmulh2.i
 }
 
+define <2 x i32> @test_vqrdmulh_lane_s32_intrinsic(<2 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> %a, <2 x i32> %v, i32 1)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_lo(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 1)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_laneq_s32_intrinsic_hi(<2 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulh_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> %a, <4 x i32> %v, i32 3)
+  ret <2 x i32> %vqrdmulh2.i
+}
+
 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
 ; CHECK: // %bb.0: // %entry
@@ -1599,6 +1832,37 @@
   ret <4 x i32> %vqrdmulh2.i
 }
 
+define <4 x i32> @test_vqrdmulhq_lane_s32_intrinsic(<4 x i32> %a, <2 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_lane_s32_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> %a, <2 x i32> %v, i32 1)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_lo(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_lo:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[1]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 1)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_laneq_s32_intrinsic_hi(<4 x i32> %a, <4 x i32> %v) {
+; CHECK-LABEL: test_vqrdmulhq_laneq_s32_intrinsic_hi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sqrdmulh v0.4s, v0.4s, v1.s[3]
+; CHECK-NEXT: ret
+entry:
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> %a, <4 x i32> %v, i32 3)
+  ret <4 x i32> %vqrdmulh2.i
+}
+
 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
 ; CHECK-LABEL: test_vmul_lane_f32:
 ; CHECK: // %bb.0: // %entry
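
Usage note (commentary, not part of the patch): with the builtins mapped to the
new @llvm.aarch64.neon.sqdmulh.lane/.laneq intrinsics, a lane-indexed saturating
doubling multiply-high written against arm_neon.h reaches the backend with the
lane index as an explicit immediate operand, so it selects the by-element
instruction form directly instead of the dup/shufflevector plus vector-vector
sequence emitted before. A minimal C sketch (the function name is hypothetical):

#include <arm_neon.h>

// Multiply each Q15 element of x by the Q15 coefficient held in lane 0 of c,
// keeping the saturated doubled high half. With this patch applied, AArch64
// clang should compile this to a single "sqdmulh v0.4h, v0.4h, v1.h[0]".
int16x4_t scale_q15(int16x4_t x, int16x4_t c) {
  return vqdmulh_lane_s16(x, c, 0);
}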