Index: test/CodeGen/aarch64-v8.1a-neon-intrinsics.c =================================================================== --- test/CodeGen/aarch64-v8.1a-neon-intrinsics.c +++ test/CodeGen/aarch64-v8.1a-neon-intrinsics.c @@ -2,127 +2,227 @@ // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \ // RUN: -target-feature +v8.1a -O3 -S -o - %s \ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ASM + +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \ +// RUN: -target-feature +v8.1a -O1 -S -emit-llvm -o - %s \ +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-IR #include <arm_neon.h> -// CHECK-AARCH64-LABEL: test_vqrdmlah_laneq_s16 +// CHECK-LABEL: test_vqrdmlah_laneq_s16 int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +// CHECK-ASM: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %b, <4 x i16> [[shuffle]]) +// CHECK-IR: tail call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) return vqrdmlah_laneq_s16(a, b, v, 7); } -// CHECK-AARCH64-LABEL: test_vqrdmlah_laneq_s32 +// CHECK-LABEL: test_vqrdmlah_laneq_s32 int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +// CHECK-ASM: sqrdmlah {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> +// CHECK-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %b, <2 x i32> [[shuffle]]) +// CHECK-IR: tail call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) return
vqrdmlah_laneq_s32(a, b, v, 3); } -// CHECK-AARCH64-LABEL: test_vqrdmlahq_laneq_s16 +// CHECK-LABEL: test_vqrdmlahq_laneq_s16 int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +// CHECK-ASM: sqrdmlah {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %b, <8 x i16> [[shuffle]]) +// CHECK-IR: tail call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) return vqrdmlahq_laneq_s16(a, b, v, 7); } -// CHECK-AARCH64-LABEL: test_vqrdmlahq_laneq_s32 +// CHECK-LABEL: test_vqrdmlahq_laneq_s32 int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +// CHECK-ASM: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %b, <4 x i32> [[shuffle]]) +// CHECK-IR: tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) return vqrdmlahq_laneq_s32(a, b, v, 3); } -// CHECK-AARCH64-LABEL: test_vqrdmlahh_s16 +// CHECK-LABEL: test_vqrdmlahh_s16 int16_t test_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) { -// CHECK-AARCH64: sqrdmlah {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} +// CHECK-ASM: sqrdmlah {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} + +// CHECK-IR: [[insb:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK-IR: [[insc:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0 +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]]) +// CHECK-IR:
[[insa:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK-IR: [[add:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[insa]], <4 x i16> [[mul]]) +// CHECK-IR: extractelement <4 x i16> [[add]], i64 0 return vqrdmlahh_s16(a, b, c); } -// CHECK-AARCH64-LABEL: test_vqrdmlahs_s32 +// CHECK-LABEL: test_vqrdmlahs_s32 int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) { -// CHECK-AARCH64: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +// CHECK-ASM: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + +// CHECK-IR: [[mul:%.*]] = tail call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %b, i32 %c) +// CHECK-IR: tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[mul]]) return vqrdmlahs_s32(a, b, c); } -// CHECK-AARCH64-LABEL: test_vqrdmlahh_lane_s16 +// CHECK-LABEL: test_vqrdmlahh_lane_s16 int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) { -// CHECK-AARCH64: sqrdmlah {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] +// CHECK-ASM: sqrdmlah {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] + +// CHECK-IR: [[insb:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[shuffle]]) +// CHECK-IR: [[insa:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK-IR: [[add:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[insa]], <4 x i16> [[mul]]) +// CHECK-IR: extractelement <4 x i16> [[add]], i64 0 return vqrdmlahh_lane_s16(a, b, c, 3); } -// CHECK-AARCH64-LABEL: test_vqrdmlahs_lane_s32 +// CHECK-LABEL: test_vqrdmlahs_lane_s32 int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) { -// CHECK-AARCH64: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +// CHECK-ASM: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] + +// CHECK-IR: [[extc:%.*]] = extractelement <2
x i32> %c, i32 1 +// CHECK-IR: [[mul:%.*]] = tail call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %b, i32 [[extc]]) +// CHECK-IR: tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[mul]]) return vqrdmlahs_lane_s32(a, b, c, 1); } -// CHECK-AARCH64-LABEL: test_vqrdmlahh_laneq_s16 +// CHECK-LABEL: test_vqrdmlahh_laneq_s16 int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) { -// CHECK-AARCH64: sqrdmlah {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] +// CHECK-ASM: sqrdmlah {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] + +// CHECK-IR: [[insb:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK-IR: [[shuffle:%.*]] = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[shuffle]]) +// CHECK-IR: [[insa:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK-IR: [[add:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[insa]], <4 x i16> [[mul]]) +// CHECK-IR: extractelement <4 x i16> [[add]], i64 0 return vqrdmlahh_laneq_s16(a, b, c, 7); } -// CHECK-AARCH64-LABEL: test_vqrdmlahs_laneq_s32 +// CHECK-LABEL: test_vqrdmlahs_laneq_s32 int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) { -// CHECK-AARCH64: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +// CHECK-ASM: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] + +// CHECK-IR: [[extc:%.*]] = extractelement <4 x i32> %c, i32 3 +// CHECK-IR: [[mul:%.*]] = tail call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %b, i32 [[extc]]) +// CHECK-IR: tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[mul]]) return vqrdmlahs_laneq_s32(a, b, c, 3); } -// CHECK-AARCH64-LABEL: test_vqrdmlsh_laneq_s16 +// CHECK-LABEL: test_vqrdmlsh_laneq_s16 int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] +// CHECK-ASM: sqrdmlsh
{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %b, <4 x i16> [[shuffle]]) +// CHECK-IR: tail call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) return vqrdmlsh_laneq_s16(a, b, v, 7); } -// CHECK-AARCH64-LABEL: test_vqrdmlsh_laneq_s32 +// CHECK-LABEL: test_vqrdmlsh_laneq_s32 int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +// CHECK-ASM: sqrdmlsh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> +// CHECK-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %b, <2 x i32> [[shuffle]]) +// CHECK-IR: tail call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) return vqrdmlsh_laneq_s32(a, b, v, 3); } -// CHECK-AARCH64-LABEL: test_vqrdmlshq_laneq_s16 +// CHECK-LABEL: test_vqrdmlshq_laneq_s16 int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] +// CHECK-ASM: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %b, <8 x i16> [[shuffle]]) +// CHECK-IR: tail call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) return vqrdmlshq_laneq_s16(a, b, v, 7); } -// CHECK-AARCH64-LABEL: test_vqrdmlshq_laneq_s32 +// CHECK-LABEL: test_vqrdmlshq_laneq_s32 int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +// CHECK-ASM: sqrdmlsh
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] + +// CHECK-IR: [[shuffle:%.*]] = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %b, <4 x i32> [[shuffle]]) +// CHECK-IR: tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) return vqrdmlshq_laneq_s32(a, b, v, 3); } -// CHECK-AARCH64-LABEL: test_vqrdmlshh_s16 +// CHECK-LABEL: test_vqrdmlshh_s16 int16_t test_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) { -// CHECK-AARCH64: sqrdmlsh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} +// CHECK-ASM: sqrdmlsh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}} + +// CHECK-IR: [[insb:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK-IR: [[insc:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0 +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[insc]]) +// CHECK-IR: [[insa:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK-IR: [[sub:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[insa]], <4 x i16> [[mul]]) +// CHECK-IR: extractelement <4 x i16> [[sub]], i64 0 return vqrdmlshh_s16(a, b, c); } -// CHECK-AARCH64-LABEL: test_vqrdmlshs_s32 +// CHECK-LABEL: test_vqrdmlshs_s32 int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) { -// CHECK-AARCH64: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +// CHECK-ASM: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + +// CHECK-IR: [[mul:%.*]] = tail call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %b, i32 %c) +// CHECK-IR: tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[mul]]) return vqrdmlshs_s32(a, b, c); } -// CHECK-AARCH64-LABEL: test_vqrdmlshh_lane_s16 +// CHECK-LABEL: test_vqrdmlshh_lane_s16 int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) { -// CHECK-AARCH64: sqrdmlsh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}},
{{v[0-9]+}}.h[3] +// CHECK-ASM: sqrdmlsh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3] + +// CHECK-IR: [[insb:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[shuffle]]) +// CHECK-IR: [[insa:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK-IR: [[sub:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[insa]], <4 x i16> [[mul]]) +// CHECK-IR: extractelement <4 x i16> [[sub]], i64 0 return vqrdmlshh_lane_s16(a, b, c, 3); } -// CHECK-AARCH64-LABEL: test_vqrdmlshs_lane_s32 +// CHECK-LABEL: test_vqrdmlshs_lane_s32 int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) { -// CHECK-AARCH64: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] +// CHECK-ASM: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] + +// CHECK-IR: [[extc:%.*]] = extractelement <2 x i32> %c, i32 1 +// CHECK-IR: [[mul:%.*]] = tail call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %b, i32 [[extc]]) +// CHECK-IR: tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[mul]]) return vqrdmlshs_lane_s32(a, b, c, 1); } -// CHECK-AARCH64-LABEL: test_vqrdmlshh_laneq_s16 +// CHECK-LABEL: test_vqrdmlshh_laneq_s16 int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) { -// CHECK-AARCH64: sqrdmlsh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] +// CHECK-ASM: sqrdmlsh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7] + +// CHECK-IR: [[insb:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0 +// CHECK-IR: [[shuffle:%.*]] = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[insb]], <4 x i16> [[shuffle]]) +// CHECK-IR: [[insa:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0 +// CHECK-IR: [[sub:%.*]] = tail
call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[insa]], <4 x i16> [[mul]]) +// CHECK-IR: extractelement <4 x i16> [[sub]], i64 0 return vqrdmlshh_laneq_s16(a, b, c, 7); } -// CHECK-AARCH64-LABEL: test_vqrdmlshs_laneq_s32 +// CHECK-LABEL: test_vqrdmlshs_laneq_s32 int32_t test_vqrdmlshs_laneq_s32(int32_t a, int32_t b, int32x4_t c) { -// CHECK-AARCH64: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] +// CHECK-ASM: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] + +// CHECK-IR: [[extc:%.*]] = extractelement <4 x i32> %c, i32 3 +// CHECK-IR: [[mul:%.*]] = tail call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %b, i32 [[extc]]) +// CHECK-IR: tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[mul]]) return vqrdmlshs_laneq_s32(a, b, c, 3); } Index: test/CodeGen/arm-v8.1a-neon-intrinsics.c =================================================================== --- test/CodeGen/arm-v8.1a-neon-intrinsics.c +++ test/CodeGen/arm-v8.1a-neon-intrinsics.c @@ -1,122 +1,244 @@ // RUN: %clang_cc1 -triple armv8.1a-linux-gnu -target-feature +neon \ // RUN: -O3 -S -o - %s \ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM-ASM + +// RUN: %clang_cc1 -triple armv8.1a-linux-gnu -target-feature +neon \ +// RUN: -O1 -S -emit-llvm -o - %s \ +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM-IR + // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \ // RUN: -target-feature +v8.1a -O3 -S -o - %s \ -// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64 +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64-ASM + +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \ +// RUN: -target-feature +v8.1a -O1 -S -emit-llvm -o - %s \ +// RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AARCH64-IR + // REQUIRES: arm-registered-target,aarch64-registered-target #include <arm_neon.h> // CHECK-LABEL: test_vqrdmlah_s16
int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) { -// CHECK-ARM: vqrdmlah.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +// CHECK-ARM-ASM: vqrdmlah.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %b, <4 x i16> %c) +// CHECK-ARM-IR: tail call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %b, <4 x i16> %c) +// CHECK-AARCH64-IR: tail call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) return vqrdmlah_s16(a, b, c); } // CHECK-LABEL: test_vqrdmlah_s32 int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) { -// CHECK-ARM: vqrdmlah.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +// CHECK-ARM-ASM: vqrdmlah.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlah {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %b, <2 x i32> %c) +// CHECK-ARM-IR: tail call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %b, <2 x i32> %c) +// CHECK-AARCH64-IR: tail call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) return vqrdmlah_s32(a, b, c); } // CHECK-LABEL: test_vqrdmlahq_s16 int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { -// CHECK-ARM: vqrdmlah.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +// CHECK-ARM-ASM: vqrdmlah.s16 q{{[0-9]+}}, q{{[0-9]+}}, 
q{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlah {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %b, <8 x i16> %c) +// CHECK-ARM-IR: tail call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %b, <8 x i16> %c) +// CHECK-AARCH64-IR: tail call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) return vqrdmlahq_s16(a, b, c); } // CHECK-LABEL: test_vqrdmlahq_s32 int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { -// CHECK-ARM: vqrdmlah.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +// CHECK-ARM-ASM: vqrdmlah.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %b, <4 x i32> %c) +// CHECK-ARM-IR: tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %b, <4 x i32> %c) +// CHECK-AARCH64-IR: tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) return vqrdmlahq_s32(a, b, c); } // CHECK-LABEL: test_vqrdmlah_lane_s16 int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { -// CHECK-ARM: vqrdmlah.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[3] -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +// CHECK-ARM-ASM: vqrdmlah.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[3] +// CHECK-AARCH64-ASM: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i16>
@llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %b, <4 x i16> [[shuffle]]) +// CHECK-ARM-IR: tail call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %b, <4 x i16> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) return vqrdmlah_lane_s16(a, b, c, 3); } // CHECK-LABEL: test_vqrdmlah_lane_s32 int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { -// CHECK-ARM: vqrdmlah.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[1] -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +// CHECK-ARM-ASM: vqrdmlah.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[1] +// CHECK-AARCH64-ASM: sqrdmlah {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-ARM-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %b, <2 x i32> [[shuffle]]) +// CHECK-ARM-IR: tail call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %b, <2 x i32> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) return vqrdmlah_lane_s32(a, b, c, 1); } // CHECK-LABEL: test_vqrdmlahq_lane_s16 int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { -// CHECK-ARM: vqrdmlah.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3] -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +// CHECK-ARM-ASM: vqrdmlah.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3] +// CHECK-AARCH64-ASM: sqrdmlah
{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-ARM-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %b, <8 x i16> [[shuffle]]) +// CHECK-ARM-IR: tail call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %b, <8 x i16> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) return vqrdmlahq_lane_s16(a, b, c, 3); } // CHECK-LABEL: test_vqrdmlahq_lane_s32 int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { -// CHECK-ARM: vqrdmlah.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[1] -// CHECK-AARCH64: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +// CHECK-ARM-ASM: vqrdmlah.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[1] +// CHECK-AARCH64-ASM: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %b, <4 x i32> [[shuffle]]) +// CHECK-ARM-IR: tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %b, <4 x i32> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) return vqrdmlahq_lane_s32(a, b, c, 1); } // CHECK-LABEL: test_vqrdmlsh_s16 int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { -// CHECK-ARM: vqrdmlsh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +// CHECK-ARM-ASM: vqrdmlsh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %b, <4 x i16> %c) +// CHECK-ARM-IR: tail call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %b, <4 x i16> %c) +// CHECK-AARCH64-IR: tail call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) return vqrdmlsh_s16(a, b, c); } // CHECK-LABEL: test_vqrdmlsh_s32 int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { -// CHECK-ARM: vqrdmlsh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +// CHECK-ARM-ASM: vqrdmlsh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %b, <2 x i32> %c) +// CHECK-ARM-IR: tail call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %b, <2 x i32> %c) +// CHECK-AARCH64-IR: tail call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) return vqrdmlsh_s32(a, b, c); } // CHECK-LABEL: test_vqrdmlshq_s16 int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { -// CHECK-ARM: vqrdmlsh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +// CHECK-ARM-ASM: vqrdmlsh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <8 x 
i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %b, <8 x i16> %c) +// CHECK-ARM-IR: tail call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %b, <8 x i16> %c) +// CHECK-AARCH64-IR: tail call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) return vqrdmlshq_s16(a, b, c); } // CHECK-LABEL: test_vqrdmlshq_s32 int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { -// CHECK-ARM: vqrdmlsh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +// CHECK-ARM-ASM: vqrdmlsh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}} +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %b, <4 x i32> %c) +// CHECK-ARM-IR: tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %b, <4 x i32> %c) +// CHECK-AARCH64-IR: tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) return vqrdmlshq_s32(a, b, c); } // CHECK-LABEL: test_vqrdmlsh_lane_s16 int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { -// CHECK-ARM: vqrdmlsh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[3] -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] +// CHECK-ARM-ASM: vqrdmlsh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[3] +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %b, <4 x i16> [[shuffle]]) +// CHECK-ARM-IR: tail call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %a, <4 x
i16> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %b, <4 x i16> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %a, <4 x i16> [[mul]]) return vqrdmlsh_lane_s16(a, b, c, 3); } // CHECK-LABEL: test_vqrdmlsh_lane_s32 int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { -// CHECK-ARM: vqrdmlsh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[1] -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +// CHECK-ARM-ASM: vqrdmlsh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[1] +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-ARM-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %b, <2 x i32> [[shuffle]]) +// CHECK-ARM-IR: tail call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %b, <2 x i32> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a, <2 x i32> [[mul]]) return vqrdmlsh_lane_s32(a, b, c, 1); } // CHECK-LABEL: test_vqrdmlshq_lane_s16 int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { -// CHECK-ARM: vqrdmlsh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3] -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] +// CHECK-ARM-ASM: vqrdmlsh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[3] +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> //
CHECK-ARM-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %b, <8 x i16> [[shuffle]]) +// CHECK-ARM-IR: tail call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %b, <8 x i16> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> [[mul]]) return vqrdmlshq_lane_s16(a, b, c, 3); } // CHECK-LABEL: test_vqrdmlshq_lane_s32 int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { -// CHECK-ARM: vqrdmlsh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[1] -// CHECK-AARCH64: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +// CHECK-ARM-ASM: vqrdmlsh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[1] +// CHECK-AARCH64-ASM: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] + +// CHECK-ARM-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-ARM-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %b, <4 x i32> [[shuffle]]) +// CHECK-ARM-IR: tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) + +// CHECK-AARCH64-IR: [[shuffle:%.*]] = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-AARCH64-IR: [[mul:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %b, <4 x i32> [[shuffle]]) +// CHECK-AARCH64-IR: tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[mul]]) return vqrdmlshq_lane_s32(a, b, c, 1); }